rosetta-cli 2.0.13b1__tar.gz → 2.0.13b3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rosetta_cli-2.0.13b1/rosetta_cli.egg-info → rosetta_cli-2.0.13b3}/PKG-INFO +2 -2
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/pyproject.toml +2 -2
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/ims_publisher.py +18 -48
- rosetta_cli-2.0.13b3/rosetta_cli/ims_utils.py +92 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/ragflow_client.py +258 -92
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3/rosetta_cli.egg-info}/PKG-INFO +2 -2
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/requires.txt +1 -1
- rosetta_cli-2.0.13b1/rosetta_cli/ims_utils.py +0 -28
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/MANIFEST.in +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/README.md +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/env.template +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/__init__.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/__main__.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/cli.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/__init__.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/base_command.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/cleanup_command.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/list_command.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/parse_command.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/publish_command.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/commands/verify_command.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/ims_auth.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/ims_config.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/__init__.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/auth_service.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/dataset_service.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/document_data.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/services/document_service.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli/typing_utils.py +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/SOURCES.txt +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/dependency_links.txt +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/entry_points.txt +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/rosetta_cli.egg-info/top_level.txt +0 -0
- {rosetta_cli-2.0.13b1 → rosetta_cli-2.0.13b3}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rosetta-cli
|
|
3
|
-
Version: 2.0.13b1
|
|
3
|
+
Version: 2.0.13b3
|
|
4
4
|
Summary: Rosetta CLI for publishing knowledge base content to RAGFlow
|
|
5
5
|
Author-email: Rosetta Team <rosetta-support@griddynamics.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -18,7 +18,7 @@ Requires-Python: >=3.12
|
|
|
18
18
|
Description-Content-Type: text/markdown
|
|
19
19
|
Requires-Dist: python-dotenv<2.0.0,>=1.0.0
|
|
20
20
|
Requires-Dist: python-frontmatter<2.0.0,>=1.1.0
|
|
21
|
-
Requires-Dist: ragflow-sdk<0.
|
|
21
|
+
Requires-Dist: ragflow-sdk<0.26.0,>=0.25.1
|
|
22
22
|
Requires-Dist: requests<3.0.0,>=2.31.0
|
|
23
23
|
Requires-Dist: tqdm<5.0.0,>=4.67.0
|
|
24
24
|
Provides-Extra: dev
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "rosetta-cli"
|
|
7
|
-
version = "2.0.
|
|
7
|
+
version = "2.0.13b03"
|
|
8
8
|
description = "Rosetta CLI for publishing knowledge base content to RAGFlow"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -23,7 +23,7 @@ classifiers = [
|
|
|
23
23
|
dependencies = [
|
|
24
24
|
"python-dotenv>=1.0.0,<2.0.0",
|
|
25
25
|
"python-frontmatter>=1.1.0,<2.0.0",
|
|
26
|
-
"ragflow-sdk>=0.25.
|
|
26
|
+
"ragflow-sdk>=0.25.1,<0.26.0",
|
|
27
27
|
"requests>=2.31.0,<3.0.0",
|
|
28
28
|
"tqdm>=4.67.0,<5.0.0",
|
|
29
29
|
]
|
|
@@ -388,26 +388,7 @@ class ContentPublisher:
|
|
|
388
388
|
# Add file size for binary files
|
|
389
389
|
if not is_text:
|
|
390
390
|
metadata['file_size'] = len(content)
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
if dry_run:
|
|
394
|
-
print(f"[DRY RUN] Would publish: {metadata.get('doc_title', metadata.get('original_path', file.name))}")
|
|
395
|
-
print(f" Document ID: {ims_doc_id}")
|
|
396
|
-
print(f" Dataset: {dataset_name}")
|
|
397
|
-
print(f" File type: {'text' if is_text else 'binary'}")
|
|
398
|
-
print(f" Metadata: {metadata}")
|
|
399
|
-
if is_text and content_str:
|
|
400
|
-
print(f" Content size: {len(content_str)} characters")
|
|
401
|
-
else:
|
|
402
|
-
print(f" File size: {metadata.get('file_size', 0)} bytes")
|
|
403
|
-
|
|
404
|
-
return PublishResult(
|
|
405
|
-
success=True,
|
|
406
|
-
document_id=ims_doc_id,
|
|
407
|
-
file_path=str(file),
|
|
408
|
-
tags=metadata.get('tags', [])
|
|
409
|
-
)
|
|
410
|
-
|
|
391
|
+
|
|
411
392
|
original_path = metadata.get("original_path", "")
|
|
412
393
|
|
|
413
394
|
# Create DocumentMetadata for RAGFlow
|
|
@@ -426,13 +407,16 @@ class ContentPublisher:
|
|
|
426
407
|
)
|
|
427
408
|
|
|
428
409
|
# Upload to RAGFlow using pre-read content from cache (no re-reading!)
|
|
410
|
+
# In dry_run, upload_document gates each SDK write at the call site,
|
|
411
|
+
# prints the would-be payloads, and returns None (treated as skipped).
|
|
429
412
|
result = self.client.upload_document(
|
|
430
413
|
file_path=file,
|
|
431
414
|
metadata=doc_metadata,
|
|
432
415
|
dataset_name=dataset_name,
|
|
433
416
|
dataset_template=self.dataset_template,
|
|
434
417
|
force=force,
|
|
435
|
-
content=content # Pass pre-read content from cache
|
|
418
|
+
content=content, # Pass pre-read content from cache
|
|
419
|
+
dry_run=dry_run,
|
|
436
420
|
)
|
|
437
421
|
|
|
438
422
|
# None means document was skipped (unchanged)
|
|
@@ -542,10 +526,14 @@ class ContentPublisher:
|
|
|
542
526
|
def _has_content_changed_cached(self, cache: DocumentData) -> bool:
|
|
543
527
|
"""
|
|
544
528
|
Check if document content has changed using pre-calculated hash.
|
|
545
|
-
|
|
529
|
+
|
|
530
|
+
Looks up the existing doc through the client's per-dataset index
|
|
531
|
+
(RAGFlow 0.25.0 ignores server-side metadata_condition, so the index
|
|
532
|
+
is the authoritative lookup).
|
|
533
|
+
|
|
546
534
|
Args:
|
|
547
535
|
cache: DocumentData with pre-calculated hash
|
|
548
|
-
|
|
536
|
+
|
|
549
537
|
Returns:
|
|
550
538
|
True if changed or new, False if unchanged
|
|
551
539
|
"""
|
|
@@ -556,46 +544,28 @@ class ContentPublisher:
|
|
|
556
544
|
'tags': cache.tags,
|
|
557
545
|
'domain': cache.domain
|
|
558
546
|
})
|
|
559
|
-
|
|
547
|
+
|
|
560
548
|
# Get dataset
|
|
561
549
|
dataset = self.client.get_dataset(name=dataset_name)
|
|
562
550
|
if not dataset:
|
|
563
551
|
return True # New dataset = new document
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
"logic": "and",
|
|
568
|
-
"conditions": [{
|
|
569
|
-
"name": "ims_doc_id",
|
|
570
|
-
"comparison_operator": "is",
|
|
571
|
-
"value": cache.ims_doc_id
|
|
572
|
-
}]
|
|
573
|
-
}
|
|
574
|
-
|
|
575
|
-
docs = self.client.list_documents(
|
|
576
|
-
dataset,
|
|
577
|
-
page_size=1,
|
|
578
|
-
metadata_condition=metadata_filter
|
|
579
|
-
)
|
|
580
|
-
|
|
581
|
-
if not docs:
|
|
552
|
+
|
|
553
|
+
existing_doc = self.client.get_existing_doc(dataset, cache.ims_doc_id)
|
|
554
|
+
if existing_doc is None:
|
|
582
555
|
return True # Document doesn't exist
|
|
583
|
-
|
|
584
|
-
# Compare hashes (use pre-calculated hash from cache)
|
|
585
|
-
existing_doc = docs[0]
|
|
556
|
+
|
|
586
557
|
existing_meta = getattr(existing_doc, 'meta_fields', {}) or {}
|
|
587
|
-
|
|
588
558
|
if isinstance(existing_meta, dict):
|
|
589
559
|
existing_hash = existing_meta.get("content_hash")
|
|
590
560
|
else:
|
|
591
561
|
existing_hash = getattr(existing_meta, 'content_hash', None)
|
|
592
|
-
|
|
562
|
+
|
|
593
563
|
if not existing_hash:
|
|
594
564
|
return True # No hash = changed
|
|
595
565
|
|
|
596
566
|
# Compare: cache.content_hash was already calculated in DocumentData
|
|
597
567
|
return cache.content_hash != str(existing_hash)
|
|
598
|
-
|
|
568
|
+
|
|
599
569
|
except Exception as e:
|
|
600
570
|
print(f" Warning: Could not check existing document: {e}")
|
|
601
571
|
return True # Assume changed on error
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Shared path utilities for Rosetta CLI."""
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, TypeVar
|
|
7
|
+
|
|
8
|
+
_T = TypeVar("_T")
|
|
9
|
+
|
|
10
|
+
# Message fragments that classify an error as retryable.
_TRANSIENT_RAGFLOW = (
    "The dataset doesn't own the document",
    "Documents not found",
    "mapper_parsing_exception",
    "Failed to update metadata",
    "timed out", "timeout",
    "Connection aborted", "Connection refused", "Connection reset",
    "status 5",
)
# Message fragments that classify an error as non-retryable. These take
# precedence over the transient fragments above.
_PERMANENT_RAGFLOW = (
    "The type is not supported",
    "format_invalid",
    "Invalid API key",
    "Insufficient permissions",
    "lacks permission",
    "You don't own",
    "meta_fields must be a dictionary",
)


def is_transient_ragflow(exc: BaseException) -> bool:
    """Classify a RAGFlow error as transient (retry) or permanent (do not retry).

    The exception's message is compared against the fragment tables
    case-insensitively, so "Connection Reset" or "ReadTimeout" are recognised
    the same way as their lower-case spellings.

    Permanent substrings win — a message containing both never retries.

    Args:
        exc: The exception raised by the failed call.

    Returns:
        True if the message contains a transient fragment and no permanent
        fragment; False otherwise (including for unrecognised messages).
    """
    msg = str(exc).casefold()
    if any(marker.casefold() in msg for marker in _PERMANENT_RAGFLOW):
        return False
    return any(marker.casefold() in msg for marker in _TRANSIENT_RAGFLOW)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def retry_call(
    fn: Callable[[], _T],
    *,
    attempts: int = 3,
    jitter_ms_range: tuple[int, int] = (150, 250),
    retry_on: Callable[[BaseException], bool] = is_transient_ragflow,
    label: str = "",
) -> _T:
    """Call ``fn`` up to ``attempts`` times; sleep flat-random ms between attempts on retry_on(exc).

    Only ``Exception`` subclasses are considered for retry. ``KeyboardInterrupt``,
    ``SystemExit`` and other bare ``BaseException`` types propagate immediately
    without consulting ``retry_on``.

    Args:
        fn: Zero-argument callable to invoke.
        attempts: Maximum number of calls, including the first one.
        jitter_ms_range: Inclusive (low, high) bounds, in milliseconds, of the
            random pause before each retry.
        retry_on: Predicate deciding whether a raised exception is retryable.
        label: Text included in the retry notice printed before each pause.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        ValueError: If ``attempts`` is smaller than 1.
        Exception: Re-raises the last exception when attempts are exhausted or
            ``retry_on`` returns False.
    """
    if attempts < 1:
        raise ValueError("attempts must be >= 1")
    for n in range(1, attempts + 1):
        try:
            return fn()
        except Exception as exc:
            # Bare `raise` keeps the original traceback intact.
            if n >= attempts or not retry_on(exc):
                raise
            jitter = random.randint(jitter_ms_range[0], jitter_ms_range[1])
            print(f" ↻ retry {n}/{attempts - 1} for {label} after {jitter}ms: {str(exc)[:120]}")
            time.sleep(jitter / 1000.0)
    # Every loop iteration either returns or raises; this line only exists so
    # the function has no implicit `return None` path.
    raise AssertionError("retry_call: loop exited without returning or raising")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def resolve_workspace_root(path: Path) -> Path:
    """Resolve the workspace root for a publish target.

    Preference order:
    1. Parent of the topmost `instructions/` directory in the target path.
    2. Nearest ancestor containing `.git` (the filesystem root is included
       in the search).
    3. The target directory itself, or the parent for a file target.

    Args:
        path: Publish target; may be a directory or a file.

    Returns:
        The resolved (absolute) workspace root.
    """
    resolved = path.resolve()
    container = resolved if resolved.is_dir() else resolved.parent

    parts = container.parts
    for index, part in enumerate(parts):
        # index 0 is skipped so the returned prefix is never empty.
        if index > 0 and part == "instructions":
            return Path(*parts[:index])

    # `parents` runs from the immediate parent up to and including the root,
    # so a `.git` located at the root is found as well.
    for candidate in (container, *container.parents):
        if (candidate / ".git").exists():
            return candidate

    return container
|
|
@@ -22,9 +22,32 @@ from typing import Any, Dict, List, Optional, cast
|
|
|
22
22
|
from ragflow_sdk import RAGFlow
|
|
23
23
|
from ragflow_sdk.modules.dataset import DataSet
|
|
24
24
|
from ragflow_sdk.modules.document import Document
|
|
25
|
+
from .ims_utils import retry_call
|
|
25
26
|
from .typing_utils import DatasetLike, DocumentLike, JsonDict
|
|
26
27
|
|
|
27
28
|
|
|
29
|
+
class _Timer:
    """Context manager that reports how long the wrapped block took.

    On exit it prints one line with the label, the elapsed time in whole
    milliseconds and an ``ok``/``FAILED`` tag. Exceptions raised inside the
    block are not suppressed.
    """

    def __init__(self, label: str) -> None:
        # t0 is overwritten with the real start time in __enter__.
        self.label, self.t0 = label, 0.0

    def __enter__(self) -> "_Timer":
        self.t0 = time.perf_counter()
        return self

    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        elapsed_ms = 1000.0 * (time.perf_counter() - self.t0)
        if exc_type:
            tag = "FAILED"
        else:
            tag = "ok"
        print(f" ⏱️ {self.label}: {elapsed_ms:.0f}ms [{tag}]")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _timed(label: str) -> _Timer:
    """Build a timing context manager that prints elapsed wall time for ``label``."""
    timer = _Timer(label)
    return timer
|
|
49
|
+
|
|
50
|
+
|
|
28
51
|
|
|
29
52
|
@dataclass
|
|
30
53
|
class DocumentMetadata:
|
|
@@ -128,6 +151,88 @@ class RAGFlowClient:
|
|
|
128
151
|
# Initialize RAGFlow SDK client
|
|
129
152
|
self._client = RAGFlow(api_key=api_key, base_url=base_url, version=version)
|
|
130
153
|
|
|
154
|
+
# Per-dataset index of {ims_doc_id: doc}, lazily built and reused across
|
|
155
|
+
# the publish session. RAGFlow 0.25.0 ignores `metadata_condition` server-
|
|
156
|
+
# side, so we cannot rely on filtered list_documents to find an existing
|
|
157
|
+
# doc by ims_doc_id; we list once and index in memory instead.
|
|
158
|
+
self._doc_index_by_dataset: dict[str, dict[str, DocumentLike]] = {}
|
|
159
|
+
# Per-client dataset lookup cache. Publishing resolves the same release
|
|
160
|
+
# dataset for every file; keep those list_datasets calls in-process.
|
|
161
|
+
self._dataset_by_id: dict[str, DataSet] = {}
|
|
162
|
+
self._dataset_by_name: dict[str, DataSet] = {}
|
|
163
|
+
|
|
164
|
+
def _clear_dataset_cache(self) -> None:
    """Empty both dataset lookup caches (by id and by name) in place."""
    for lookup in (self._dataset_by_id, self._dataset_by_name):
        lookup.clear()
|
|
167
|
+
|
|
168
|
+
def _remember_dataset(self, dataset: DataSet) -> DataSet:
    """Record a dataset in the by-id and by-name caches and hand it back.

    A key is only written when the corresponding attribute exists and is
    truthy; keys are stored as strings.

    Args:
        dataset: Dataset object to cache.

    Returns:
        The same ``dataset`` object, unchanged.
    """
    ident, title = (getattr(dataset, attr, None) for attr in ("id", "name"))
    if ident:
        self._dataset_by_id[str(ident)] = dataset
    if title:
        self._dataset_by_name[str(title)] = dataset
    return dataset
|
|
176
|
+
|
|
177
|
+
def _get_doc_index(self, dataset: DatasetLike) -> dict[str, DocumentLike]:
    """Return the {ims_doc_id: doc} index for a dataset, building it on first use.

    The index is cached per dataset id on the client and reused by later
    calls; `_remember_doc` and `_forget_doc` keep it in step with uploads
    and deletes.

    Ownership/permission errors raised by list_documents are tolerated:
    the dataset is then treated as having no visible documents (an empty
    index is cached) so publishing continues with fresh uploads. Any other
    client error propagates.

    Args:
        dataset: Dataset whose documents should be indexed.

    Returns:
        Mapping of ims_doc_id (as string) to the listed document object.
    """
    key = str(dataset.id)
    known = self._doc_index_by_dataset.get(key)
    if known is not None:
        return known

    try:
        listed = self.list_documents(dataset, page_size=self.page_size)
    except RAGFlowClientError as err:
        text = str(err).lower()
        tolerated = ("you don't own", "you do not own", "lacks permission")
        if not any(marker in text for marker in tolerated):
            raise
        # Cache the empty result so the failing listing is not retried per file.
        self._doc_index_by_dataset[key] = {}
        return {}

    built: dict[str, DocumentLike] = {}
    for entry in listed:
        fields = getattr(entry, "meta_fields", {}) or {}
        # meta_fields may be a plain dict or an attribute-style object.
        if isinstance(fields, dict):
            doc_key = fields.get("ims_doc_id")
        else:
            doc_key = getattr(fields, "ims_doc_id", None)
        if doc_key:
            built[str(doc_key)] = entry

    self._doc_index_by_dataset[key] = built
    print(f" 📚 doc-index built for dataset {key}: {len(built)} indexed of {len(listed)} listed")
    return built
|
|
218
|
+
|
|
219
|
+
def get_existing_doc(self, dataset: DatasetLike, ims_doc_id: str) -> DocumentLike | None:
    """Look up a document by its ims_doc_id in the per-dataset index.

    The in-memory index is used because RAGFlow 0.25.0 ignores
    metadata_condition filters server-side.

    Args:
        dataset: Dataset to search.
        ims_doc_id: Identifier to look up; converted to a string first.

    Returns:
        The indexed document, or None when the id is not in the index.
    """
    index = self._get_doc_index(dataset)
    wanted = str(ims_doc_id)
    return index.get(wanted)
|
|
226
|
+
|
|
227
|
+
def _remember_doc(self, dataset: DatasetLike, ims_doc_id: str, doc: DocumentLike) -> None:
    """Store ``doc`` under ``ims_doc_id`` in the dataset's cached index.

    An empty index is created for the dataset if none is cached yet.
    """
    per_dataset = self._doc_index_by_dataset.setdefault(str(dataset.id), {})
    per_dataset[str(ims_doc_id)] = doc
|
|
230
|
+
|
|
231
|
+
def _forget_doc(self, dataset: DatasetLike, ims_doc_id: str) -> None:
    """Remove ``ims_doc_id`` from the dataset's cached index, if present.

    Does nothing when no index is cached for the dataset or the id is not
    in it.
    """
    try:
        per_dataset = self._doc_index_by_dataset[str(dataset.id)]
    except KeyError:
        return
    per_dataset.pop(str(ims_doc_id), None)
|
|
235
|
+
|
|
131
236
|
def _handle_response_error(self, response: Any, operation: str) -> None:
|
|
132
237
|
"""
|
|
133
238
|
Handle API response errors uniformly.
|
|
@@ -172,8 +277,9 @@ class RAGFlowClient:
|
|
|
172
277
|
embedding_model: str | None = None,
|
|
173
278
|
permission: str = "team",
|
|
174
279
|
chunk_method: str | None = None,
|
|
175
|
-
parser_config: JsonDict | None = None
|
|
176
|
-
|
|
280
|
+
parser_config: JsonDict | None = None,
|
|
281
|
+
dry_run: bool = False
|
|
282
|
+
) -> DataSet | None:
|
|
177
283
|
"""
|
|
178
284
|
Create a new dataset.
|
|
179
285
|
|
|
@@ -212,11 +318,17 @@ class RAGFlowClient:
|
|
|
212
318
|
# Convert parser_config dict to DataSet.ParserConfig object if needed
|
|
213
319
|
if parser_cfg:
|
|
214
320
|
kwargs["parser_config"] = DataSet.ParserConfig(self._client, parser_cfg)
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
321
|
+
|
|
322
|
+
if dry_run:
|
|
323
|
+
print(f" [DRY RUN] would self._client.create_dataset({json.dumps({k: v for k, v in kwargs.items() if k != 'parser_config'})})")
|
|
324
|
+
return None
|
|
325
|
+
|
|
326
|
+
with _timed(f"create_dataset(name={name})"):
|
|
327
|
+
dataset = self._client.create_dataset(**kwargs)
|
|
328
|
+
|
|
329
|
+
self._clear_dataset_cache()
|
|
330
|
+
return self._remember_dataset(cast(DataSet, dataset))
|
|
331
|
+
|
|
220
332
|
except Exception as e:
|
|
221
333
|
raise RAGFlowClientError(f"Failed to create dataset '{name}': {str(e)}")
|
|
222
334
|
|
|
@@ -247,15 +359,16 @@ class RAGFlowClient:
|
|
|
247
359
|
RAGFlowClientError: If listing fails
|
|
248
360
|
"""
|
|
249
361
|
try:
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
362
|
+
with _timed(f"list_datasets(name={name},id={id})"):
|
|
363
|
+
datasets = self._client.list_datasets(
|
|
364
|
+
page=page,
|
|
365
|
+
page_size=page_size,
|
|
366
|
+
orderby=orderby,
|
|
367
|
+
desc=desc,
|
|
368
|
+
id=id,
|
|
369
|
+
name=name
|
|
370
|
+
)
|
|
371
|
+
|
|
259
372
|
return cast(list[DataSet], datasets)
|
|
260
373
|
|
|
261
374
|
except Exception as e:
|
|
@@ -281,18 +394,28 @@ class RAGFlowClient:
|
|
|
281
394
|
"""
|
|
282
395
|
try:
|
|
283
396
|
if id:
|
|
397
|
+
cached = self._dataset_by_id.get(str(id))
|
|
398
|
+
if cached is not None:
|
|
399
|
+
return cached
|
|
400
|
+
|
|
284
401
|
# Filter by ID
|
|
285
|
-
|
|
402
|
+
with _timed(f"list_datasets(id={id})"):
|
|
403
|
+
datasets = self._client.list_datasets(id=id, page_size=1)
|
|
286
404
|
elif name:
|
|
405
|
+
cached = self._dataset_by_name.get(str(name))
|
|
406
|
+
if cached is not None:
|
|
407
|
+
return cached
|
|
408
|
+
|
|
287
409
|
# Filter by name (RAGFlow does substring, we verify exact match)
|
|
288
|
-
|
|
410
|
+
with _timed(f"list_datasets(name={name})"):
|
|
411
|
+
datasets = self._client.list_datasets(name=name, page_size=10)
|
|
289
412
|
# Filter for exact match
|
|
290
413
|
datasets = [ds for ds in datasets if ds.name == name]
|
|
291
414
|
else:
|
|
292
415
|
return None
|
|
293
416
|
|
|
294
417
|
if datasets and len(datasets) > 0:
|
|
295
|
-
return datasets[0]
|
|
418
|
+
return self._remember_dataset(cast(DataSet, datasets[0]))
|
|
296
419
|
return None
|
|
297
420
|
|
|
298
421
|
except Exception as e:
|
|
@@ -302,39 +425,46 @@ class RAGFlowClient:
|
|
|
302
425
|
return None
|
|
303
426
|
raise RAGFlowClientError(f"Failed to get dataset: {str(e)}")
|
|
304
427
|
|
|
305
|
-
def delete_datasets(self, ids: list[str]) -> None:
|
|
428
|
+
def delete_datasets(self, ids: list[str], dry_run: bool = False) -> None:
    """
    Delete datasets by IDs.

    Args:
        ids: List of dataset IDs to delete. An empty list is a no-op: no
            SDK call is made and the dataset cache is left untouched.
        dry_run: If True, print would-be call and skip the SDK write.

    Raises:
        RAGFlowClientError: If deletion fails
    """
    try:
        if dry_run:
            print(f" [DRY RUN] would self._client.delete_datasets(ids={json.dumps(ids)})")
            return
        if not ids:
            # Nothing to delete, so skip the round trip.
            # NOTE(review): some RAGFlow API docs describe an unspecified id
            # list as "delete all datasets" — confirm for the pinned SDK.
            return
        with _timed(f"delete_datasets(n={len(ids)})"):
            self._client.delete_datasets(ids=ids)
        # Cached dataset objects may now refer to deleted datasets.
        self._clear_dataset_cache()

    except Exception as e:
        raise RAGFlowClientError(f"Failed to delete datasets: {str(e)}") from e
|
|
320
449
|
|
|
321
|
-
def _ensure_dataset(self, name: str, description: str = "") -> DataSet:
|
|
450
|
+
def _ensure_dataset(self, name: str, description: str = "", dry_run: bool = False) -> DataSet | None:
    """
    Return the dataset called ``name``, creating it when it does not exist.

    Args:
        name: Dataset name
        description: Dataset description (used if creating)
        dry_run: Forwarded to create_dataset when the dataset is missing.

    Returns:
        The existing dataset if one is found; otherwise whatever
        create_dataset returns (None when dry_run skips the create).
    """
    existing = self.get_dataset(name=name)
    if existing is None:
        # Missing: delegate to create_dataset, which honours dry_run.
        return self.create_dataset(name, description, dry_run=dry_run)
    return existing
|
|
338
468
|
|
|
339
469
|
def _resolve_dataset_name(self, template: str, release: str | None) -> str:
|
|
340
470
|
"""
|
|
@@ -379,7 +509,8 @@ class RAGFlowClient:
|
|
|
379
509
|
dataset_name: str | None = None,
|
|
380
510
|
dataset_template: str = "aia-{release}",
|
|
381
511
|
force: bool = False,
|
|
382
|
-
content: bytes | None = None # NEW: Pre-read content from cache
|
|
512
|
+
content: bytes | None = None, # NEW: Pre-read content from cache
|
|
513
|
+
dry_run: bool = False
|
|
383
514
|
) -> tuple[DocumentLike, str] | None:
|
|
384
515
|
"""
|
|
385
516
|
Upload document with upsert semantics and change detection.
|
|
@@ -438,11 +569,20 @@ class RAGFlowClient:
|
|
|
438
569
|
metadata.release
|
|
439
570
|
)
|
|
440
571
|
|
|
441
|
-
# Ensure dataset exists
|
|
572
|
+
# Ensure dataset exists (dry_run gates the underlying create_dataset call)
|
|
442
573
|
dataset = self._ensure_dataset(
|
|
443
574
|
resolved_name,
|
|
444
|
-
f"IMS Knowledge - Release {metadata.release}" if metadata.release else "IMS Knowledge"
|
|
575
|
+
f"IMS Knowledge - Release {metadata.release}" if metadata.release else "IMS Knowledge",
|
|
576
|
+
dry_run=dry_run,
|
|
445
577
|
)
|
|
578
|
+
if dataset is None:
|
|
579
|
+
# dry_run path where dataset would have been created but wasn't.
|
|
580
|
+
# Return a sentinel so the publisher reports "would-publish", not "skipped".
|
|
581
|
+
# dataset.id is unavailable here; use resolved_name as the dataset identifier.
|
|
582
|
+
# Publisher only consumes the dataset_id for parsing (guarded by not dry_run).
|
|
583
|
+
print(f" [DRY RUN] dataset '{resolved_name}' missing; would be created.")
|
|
584
|
+
from types import SimpleNamespace
|
|
585
|
+
return (cast(DocumentLike, SimpleNamespace(id=metadata.ims_doc_id)), resolved_name)
|
|
446
586
|
|
|
447
587
|
# Build display name from normalized doc title when available.
|
|
448
588
|
# For R1, doc_title is filename; for R2, doc_title is logical path.
|
|
@@ -450,37 +590,11 @@ class RAGFlowClient:
|
|
|
450
590
|
filename = metadata.doc_title or (file_path.name if file_path else "")
|
|
451
591
|
title = self._build_title_with_tags(metadata.tags, filename)
|
|
452
592
|
|
|
453
|
-
# Check if document exists by
|
|
593
|
+
# Check if document exists by ims_doc_id via the per-dataset index.
|
|
594
|
+
# RAGFlow 0.25.0 ignores metadata_condition server-side, so the index
|
|
595
|
+
# (built from a single list-all per dataset) is the authoritative lookup.
|
|
454
596
|
start_time = time.time()
|
|
455
|
-
|
|
456
|
-
# Use server-side metadata filtering to find document by ims_doc_id.
|
|
457
|
-
# RAGFlow may return ownership-style errors when the filtered lookup
|
|
458
|
-
# misses a document in team-shared datasets; treat that as "not found".
|
|
459
|
-
try:
|
|
460
|
-
existing_docs = self.list_documents(
|
|
461
|
-
dataset=dataset,
|
|
462
|
-
metadata_condition={
|
|
463
|
-
"logic": "and",
|
|
464
|
-
"conditions": [{
|
|
465
|
-
"name": "ims_doc_id",
|
|
466
|
-
"comparison_operator": "is",
|
|
467
|
-
"value": metadata.ims_doc_id
|
|
468
|
-
}]
|
|
469
|
-
},
|
|
470
|
-
page_size=1
|
|
471
|
-
)
|
|
472
|
-
except RAGFlowClientError as e:
|
|
473
|
-
msg = str(e).lower()
|
|
474
|
-
if (
|
|
475
|
-
"you don't own" in msg
|
|
476
|
-
or "you do not own" in msg
|
|
477
|
-
or "lacks permission" in msg
|
|
478
|
-
):
|
|
479
|
-
existing_docs = []
|
|
480
|
-
else:
|
|
481
|
-
raise
|
|
482
|
-
|
|
483
|
-
existing_doc = existing_docs[0] if existing_docs else None
|
|
597
|
+
existing_doc = self.get_existing_doc(dataset, metadata.ims_doc_id)
|
|
484
598
|
|
|
485
599
|
if existing_doc:
|
|
486
600
|
# Check if content changed by comparing hashes
|
|
@@ -500,24 +614,36 @@ class RAGFlowClient:
|
|
|
500
614
|
return None
|
|
501
615
|
|
|
502
616
|
# Content changed, delete old version
|
|
503
|
-
|
|
617
|
+
if dry_run:
|
|
618
|
+
print(f" [DRY RUN] would dataset.delete_documents({json.dumps([existing_doc.id])})")
|
|
619
|
+
else:
|
|
620
|
+
with _timed(f"dataset.delete_documents(id={existing_doc.id})"):
|
|
621
|
+
dataset.delete_documents([existing_doc.id])
|
|
622
|
+
self._forget_doc(dataset, metadata.ims_doc_id)
|
|
504
623
|
print(f" 🔄 Updating: {title}")
|
|
505
624
|
else:
|
|
506
625
|
print(f" ⬆️ Uploading: {title}")
|
|
507
|
-
|
|
626
|
+
|
|
508
627
|
# Upload document
|
|
509
628
|
try:
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
"blob"
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
629
|
+
upload_payload = [{"display_name": title, "blob_bytes": len(content)}]
|
|
630
|
+
if dry_run:
|
|
631
|
+
print(f" [DRY RUN] would dataset.upload_documents({json.dumps(upload_payload)}) (blob_bytes shown instead of raw blob)")
|
|
632
|
+
doc = None
|
|
633
|
+
else:
|
|
634
|
+
with _timed(f"dataset.upload_documents(bytes={len(content)})"):
|
|
635
|
+
documents = dataset.upload_documents([{
|
|
636
|
+
"display_name": title,
|
|
637
|
+
"blob": content
|
|
638
|
+
}])
|
|
639
|
+
if not documents:
|
|
640
|
+
raise RAGFlowClientError("Upload returned no documents")
|
|
641
|
+
doc = documents[0]
|
|
519
642
|
|
|
520
|
-
#
|
|
643
|
+
# RAGFlow 0.25.x rejects None and dict values in meta_fields
|
|
644
|
+
# (validate_document_meta_fields). Drop None entries; JSON-stringify
|
|
645
|
+
# the frontmatter dict in place under the same key so the validator
|
|
646
|
+
# accepts it. The MCP read side already json.loads it on the way back.
|
|
521
647
|
meta_fields: JsonDict = {
|
|
522
648
|
"ims_doc_id": metadata.ims_doc_id,
|
|
523
649
|
"tags": metadata.tags,
|
|
@@ -525,18 +651,47 @@ class RAGFlowClient:
|
|
|
525
651
|
"release": metadata.release,
|
|
526
652
|
"content_hash": metadata.content_hash,
|
|
527
653
|
"original_path": metadata.original_path,
|
|
528
|
-
"sort_order": metadata.sort_order,
|
|
529
654
|
"doc_title": metadata.doc_title,
|
|
530
655
|
}
|
|
656
|
+
if metadata.sort_order is not None:
|
|
657
|
+
meta_fields["sort_order"] = metadata.sort_order
|
|
531
658
|
if metadata.line_count is not None:
|
|
532
659
|
meta_fields["line_count"] = metadata.line_count
|
|
533
660
|
if metadata.resource_path is not None:
|
|
534
661
|
meta_fields["resource_path"] = metadata.resource_path
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
662
|
+
if metadata.frontmatter is not None:
|
|
663
|
+
# Stored under "fm" (not "frontmatter") because the per-tenant
|
|
664
|
+
# ES doc-meta index (ragflow_doc_meta_{tenant_id}) commits a
|
|
665
|
+
# sticky "object" dynamic mapping for any key that was first
|
|
666
|
+
# written as a dict. Dropping datasets does NOT reset the index;
|
|
667
|
+
# the "frontmatter" key is permanently typed as object in
|
|
668
|
+
# existing deployments, so any string write to it is rejected.
|
|
669
|
+
# Using "fm" gets a fresh dynamic mapping as "text/keyword".
|
|
670
|
+
# MCP readers fall back to the legacy "frontmatter" key so old
|
|
671
|
+
# documents written before this rename remain readable.
|
|
672
|
+
meta_fields["fm"] = json.dumps(
|
|
673
|
+
metadata.frontmatter,
|
|
674
|
+
sort_keys=True,
|
|
675
|
+
ensure_ascii=False,
|
|
676
|
+
default=str,
|
|
677
|
+
)
|
|
538
678
|
|
|
539
|
-
|
|
679
|
+
if dry_run:
|
|
680
|
+
print(f" [DRY RUN] would doc.update({json.dumps({'meta_fields': meta_fields}, ensure_ascii=False, default=str)})")
|
|
681
|
+
elapsed = time.time() - start_time
|
|
682
|
+
print(f" [DRY RUN] would Done ({elapsed:.2f}s): {title}")
|
|
683
|
+
# Return a sentinel doc so the publisher reports this file as
|
|
684
|
+
# "would-publish" rather than "skipped (unchanged)".
|
|
685
|
+
from types import SimpleNamespace
|
|
686
|
+
return (cast(DocumentLike, SimpleNamespace(id=metadata.ims_doc_id)), dataset.id)
|
|
687
|
+
assert doc is not None
|
|
688
|
+
def _do_update() -> None:
|
|
689
|
+
with _timed(f"doc.update(id={doc.id})"):
|
|
690
|
+
doc.update({"meta_fields": meta_fields})
|
|
691
|
+
retry_call(
|
|
692
|
+
_do_update,
|
|
693
|
+
label=f"doc.update({doc.id})",
|
|
694
|
+
)
|
|
540
695
|
# SDK update() does not echo meta_fields back in the PUT response;
|
|
541
696
|
# re-fetch to get the actual stored state.
|
|
542
697
|
try:
|
|
@@ -545,10 +700,10 @@ class RAGFlowClient:
|
|
|
545
700
|
if updated_meta:
|
|
546
701
|
if isinstance(updated_meta, dict):
|
|
547
702
|
meta_tags = updated_meta.get('tags', [])
|
|
548
|
-
meta_fm = updated_meta.get('frontmatter')
|
|
703
|
+
meta_fm = updated_meta.get('fm') or updated_meta.get('frontmatter')
|
|
549
704
|
else:
|
|
550
705
|
meta_tags = getattr(updated_meta, 'tags', []) or []
|
|
551
|
-
meta_fm = getattr(updated_meta, 'frontmatter', None)
|
|
706
|
+
meta_fm = getattr(updated_meta, 'fm', None) or getattr(updated_meta, 'frontmatter', None)
|
|
552
707
|
tag_count = len(meta_tags) if isinstance(meta_tags, list) else 0
|
|
553
708
|
print(f" ✅ Metadata set: {tag_count} tags, frontmatter={'yes' if meta_fm else 'no'}")
|
|
554
709
|
else:
|
|
@@ -558,7 +713,10 @@ class RAGFlowClient:
|
|
|
558
713
|
|
|
559
714
|
elapsed = time.time() - start_time
|
|
560
715
|
print(f" ✅ Done ({elapsed:.2f}s): {title}")
|
|
561
|
-
|
|
716
|
+
|
|
717
|
+
# Remember the new doc so subsequent lookups in this session find it.
|
|
718
|
+
self._remember_doc(dataset, metadata.ims_doc_id, cast(DocumentLike, doc))
|
|
719
|
+
|
|
562
720
|
# Return doc object and dataset ID for parsing
|
|
563
721
|
# doc.id is RAGFlow's internal document ID needed for parsing
|
|
564
722
|
return (cast(DocumentLike, doc), dataset.id)
|
|
@@ -566,30 +724,36 @@ class RAGFlowClient:
|
|
|
566
724
|
except Exception as e:
|
|
567
725
|
raise RAGFlowClientError(f"Failed to upload document '{title}': {str(e)}")
|
|
568
726
|
|
|
569
|
-
def trigger_parse(self, dataset_id: str, document_ids: list[str]) -> None:
|
|
727
|
+
def trigger_parse(self, dataset_id: str, document_ids: list[str], dry_run: bool = False) -> None:
|
|
570
728
|
"""
|
|
571
729
|
Trigger async parsing for documents.
|
|
572
|
-
|
|
730
|
+
|
|
573
731
|
Args:
|
|
574
732
|
dataset_id: Dataset ID containing documents
|
|
575
733
|
document_ids: List of document IDs to parse
|
|
576
|
-
|
|
734
|
+
dry_run: If True, print would-be call and skip the SDK write.
|
|
735
|
+
|
|
577
736
|
Raises:
|
|
578
737
|
RAGFlowClientError: If parsing trigger fails
|
|
579
738
|
"""
|
|
739
|
+
if dry_run:
|
|
740
|
+
print(f" [DRY RUN] would dataset({json.dumps(dataset_id)}).async_parse_documents({json.dumps(document_ids)})")
|
|
741
|
+
return
|
|
580
742
|
dataset = self.get_dataset(id=dataset_id)
|
|
581
743
|
if not dataset:
|
|
582
744
|
raise NotFoundError(f"Dataset not found: {dataset_id}")
|
|
583
|
-
|
|
745
|
+
|
|
584
746
|
try:
|
|
585
|
-
|
|
747
|
+
with _timed(f"async_parse_documents(n={len(document_ids)})"):
|
|
748
|
+
dataset.async_parse_documents(document_ids)
|
|
586
749
|
except Exception as e:
|
|
587
750
|
raise RAGFlowClientError(f"Failed to trigger parsing: {str(e)}")
|
|
588
751
|
|
|
589
752
|
def parse_documents_batch(
|
|
590
753
|
self,
|
|
591
754
|
documents: list[JsonDict],
|
|
592
|
-
silent: bool = False
|
|
755
|
+
silent: bool = False,
|
|
756
|
+
dry_run: bool = False
|
|
593
757
|
) -> dict[str, list[str]]:
|
|
594
758
|
"""
|
|
595
759
|
Trigger parsing for multiple documents across datasets.
|
|
@@ -636,7 +800,7 @@ class RAGFlowClient:
|
|
|
636
800
|
print(f" → Document IDs: {doc_ids[:3]}{'...' if len(doc_ids) > 3 else ''}")
|
|
637
801
|
|
|
638
802
|
try:
|
|
639
|
-
self.trigger_parse(dataset_id, doc_ids)
|
|
803
|
+
self.trigger_parse(dataset_id, doc_ids, dry_run=dry_run)
|
|
640
804
|
success_datasets.append(dataset_id)
|
|
641
805
|
except Exception as e:
|
|
642
806
|
failed_datasets.append(dataset_id)
|
|
@@ -797,7 +961,9 @@ class RAGFlowClient:
|
|
|
797
961
|
|
|
798
962
|
# Bypass SDK and call HTTP API directly
|
|
799
963
|
# SDK doesn't support run, suffix, metadata_condition parameters
|
|
800
|
-
|
|
964
|
+
cond_keys = sorted(params.keys())
|
|
965
|
+
with _timed(f"list_documents({','.join(cond_keys)})"):
|
|
966
|
+
res = dataset.get(f"/datasets/{dataset.id}/documents", params=params)
|
|
801
967
|
res_json = cast(JsonDict, cast(Any, res).json())
|
|
802
968
|
|
|
803
969
|
if res_json.get("code") != 0:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rosetta-cli
|
|
3
|
-
Version: 2.0.13b1
|
|
3
|
+
Version: 2.0.13b3
|
|
4
4
|
Summary: Rosetta CLI for publishing knowledge base content to RAGFlow
|
|
5
5
|
Author-email: Rosetta Team <rosetta-support@griddynamics.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -18,7 +18,7 @@ Requires-Python: >=3.12
|
|
|
18
18
|
Description-Content-Type: text/markdown
|
|
19
19
|
Requires-Dist: python-dotenv<2.0.0,>=1.0.0
|
|
20
20
|
Requires-Dist: python-frontmatter<2.0.0,>=1.1.0
|
|
21
|
-
Requires-Dist: ragflow-sdk<0.
|
|
21
|
+
Requires-Dist: ragflow-sdk<0.26.0,>=0.25.1
|
|
22
22
|
Requires-Dist: requests<3.0.0,>=2.31.0
|
|
23
23
|
Requires-Dist: tqdm<5.0.0,>=4.67.0
|
|
24
24
|
Provides-Extra: dev
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
"""Shared path utilities for Rosetta CLI."""
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def resolve_workspace_root(path: Path) -> Path:
|
|
7
|
-
"""Resolve the workspace root for a publish target.
|
|
8
|
-
|
|
9
|
-
Preference order:
|
|
10
|
-
1. Parent of the topmost `instructions/` directory in the target path.
|
|
11
|
-
2. Nearest ancestor containing `.git`.
|
|
12
|
-
3. The target directory itself, or the parent for a file target.
|
|
13
|
-
"""
|
|
14
|
-
resolved = path.resolve()
|
|
15
|
-
container = resolved if resolved.is_dir() else resolved.parent
|
|
16
|
-
|
|
17
|
-
parts = container.parts
|
|
18
|
-
for index, part in enumerate(parts):
|
|
19
|
-
if part == "instructions" and index > 0:
|
|
20
|
-
return Path(*parts[:index])
|
|
21
|
-
|
|
22
|
-
current = container
|
|
23
|
-
while current != current.parent:
|
|
24
|
-
if (current / ".git").exists():
|
|
25
|
-
return current
|
|
26
|
-
current = current.parent
|
|
27
|
-
|
|
28
|
-
return container
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|