rosetta-cli 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ """IMS Services Package - Service layer for common operations"""
2
+
3
+ from .document_data import DocumentData
4
+ from .document_service import DocumentService
5
+ from .dataset_service import DatasetService
6
+ from .auth_service import AuthService
7
+
8
+ __all__ = ['DocumentData', 'DocumentService', 'DatasetService', 'AuthService']
@@ -0,0 +1,114 @@
1
+ """
2
+ Authentication Service
3
+
4
+ Handles API key verification and authentication-related operations.
5
+ Eliminates code duplication across commands.
6
+ """
7
+
8
+ import time
9
+ from typing import Any, Dict, Optional
10
+
11
+ from ..ims_auth import IMSAuthManager
12
+ from ..ims_config import IMSConfig
13
+ from ..ragflow_client import RAGFlowClient
14
+
15
+
16
class AuthService:
    """Service for handling authentication operations.

    Thin presentation layer over ``IMSAuthManager``: it delegates the actual
    checks to the manager and prints human-readable results to stdout.
    """

    def __init__(self, client: RAGFlowClient, config: IMSConfig):
        """
        Initialize AuthService.

        Args:
            client: RAGFlow client instance
            config: IMS configuration
        """
        self.client = client
        self.config = config
        self.auth_manager = IMSAuthManager(client, config)

    def verify_api_key(self) -> tuple[bool, float]:
        """
        Verify API key with timing and display results.

        Returns:
            Tuple of (success: bool, duration: float). ``duration`` is the
            elapsed time in seconds, including the early-exit case where no
            API key is configured.
        """
        # perf_counter() is monotonic, so the reported duration cannot go
        # negative or jump when the system clock is adjusted mid-request.
        start_time = time.perf_counter()

        if not self.config.api_key:
            print("✗ API key not configured")
            return False, time.perf_counter() - start_time

        print(f"→ Verifying API key for {self.config.base_url}")

        success, error_msg = self.auth_manager.verify_api_key()
        duration = time.perf_counter() - start_time

        if success:
            print("✓ API key is valid")
            print(f" Authentication: {duration:.2f}s")
        else:
            print(f"\n✗ {error_msg}")

        return success, duration

    def verify_connection(self) -> bool:
        """
        Verify full connection including server health and display results.

        Returns:
            True if connection successful, False otherwise
        """
        success, error_msg = self.auth_manager.verify_connection()

        if success:
            print(f"✓ Connected to RAGFlow at {self.config.base_url}")
            server_info = self.auth_manager.get_server_info()
            # Server info is optional; skip the details when none is returned.
            if server_info:
                self._display_server_info(server_info)
        else:
            print(f"\n✗ {error_msg}")

        return success

    @staticmethod
    def verify_or_exit(client: RAGFlowClient, config: IMSConfig) -> None:
        """
        Verify API key and exit if verification fails.

        Args:
            client: RAGFlow client instance
            config: IMS configuration

        Raises:
            SystemExit: If verification fails (exit status 1)
        """
        auth_service = AuthService(client, config)
        success, _ = auth_service.verify_api_key()

        if not success:
            print("\n✗ Authentication failed. Please check your API key.")
            # Imported here because only this failure path needs it.
            import sys
            sys.exit(1)

    def _display_server_info(self, server_info: Dict[str, Any]) -> None:
        """Display server information.

        Missing keys are shown as ``unknown`` instead of raising, so an
        incomplete payload cannot turn a successful connection check into a
        crash. Health details are printed only for the recognised statuses
        ``ok``, ``nok`` and ``unavailable``.
        """
        print("\nServer Information:")
        print(f" Environment: {server_info.get('environment', 'unknown')}")
        print(f" Dataset default: {server_info.get('dataset_default', 'unknown')}")
        print(f" Dataset template: {server_info.get('dataset_template', 'unknown')}")

        # An absent key yields None, which matches no branch below.
        health_status = server_info.get('health_status')
        if health_status == 'ok':
            print("\n✓ System Health: All dependencies healthy")
        elif health_status == 'nok':
            print("\n⚠️ System Health: Some dependencies unhealthy")
            # ``or {}`` also covers a payload that carries an explicit null.
            services = server_info.get('services') or {}
            for service, status in services.items():
                symbol = "✓" if status == "ok" else "✗"
                print(f" {symbol} {service}: {status}")
        elif health_status == 'unavailable':
            print("\n⏸️ System Health: Check unavailable")
@@ -0,0 +1,72 @@
1
+ """
2
+ Dataset Service
3
+
4
+ Handles dataset resolution, auto-detection, and management operations.
5
+ Eliminates code duplication across commands.
6
+ """
7
+
8
+ from typing import List, Optional
9
+
10
+ from ..ims_config import IMSConfig
11
+ from ..ragflow_client import RAGFlowClient
12
+
13
+
14
class DatasetService:
    """Dataset lookup helpers shared by the CLI commands."""

    def __init__(self, client: RAGFlowClient, config: IMSConfig):
        """
        Store the collaborators used for dataset lookups.

        Args:
            client: RAGFlow client instance
            config: IMS configuration
        """
        self.client = client
        self.config = config

    def resolve_dataset_name(self, args_dataset: Optional[str]) -> tuple[Optional[str], bool]:
        """
        Work out which dataset a command should operate on.

        An explicitly supplied name always wins. Otherwise the datasets are
        listed and filtered client-side by the literal text that precedes the
        first ``{`` in the configured dataset template.

        Args:
            args_dataset: Dataset name from arguments (can be None)

        Returns:
            Tuple of (dataset_name: str or None, auto_detected: bool):
            - (args_dataset, False) when a name was given explicitly
            - (name, True) when exactly one dataset matches the prefix
            - (None, False) when several datasets match (they are printed)
            - (configured default, True) when nothing matches
        """
        if args_dataset:
            return args_dataset, False

        prefix = self.config.dataset_template.split('{')[0]

        # RAGFlow's name filter is exact match, not substring, so prefix
        # matching has to be done here on the listed datasets.
        candidates = [
            dataset
            for dataset in self.client.list_datasets(page_size=self.config.page_size)
            if dataset.name.startswith(prefix)
        ]

        if not candidates:
            # Nothing matches the template: fall back to the default.
            print(f"Using default dataset: {self.config.dataset_default}")
            return self.config.dataset_default, True

        if len(candidates) > 1:
            print(f"Multiple datasets match pattern '{prefix}*':")
            for dataset in candidates:
                print(f" - {dataset.name}")
            print("\nPlease specify which dataset using --dataset flag")
            return None, False

        only_match = candidates[0]
        print(f"Auto-detected dataset: {only_match.name} (from template pattern '{prefix}*')")
        return only_match.name, True

    def display_available_datasets(self) -> None:
        """Print the name of every dataset returned by the client."""
        available = self.client.list_datasets(page_size=self.config.page_size)
        print("\nAvailable datasets:")
        for dataset in available:
            print(f" - {dataset.name}")
@@ -0,0 +1,408 @@
1
+ """Document data model with frontmatter-aware metadata extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ import hashlib
8
+ import uuid
9
+ from typing import List, Optional, cast
10
+
11
+ from ..typing_utils import JsonDict, JsonValue
12
+
13
+ try:
14
+ import frontmatter
15
+ except Exception: # pragma: no cover - guarded at runtime
16
+ frontmatter = None
17
+
18
+
19
+ def _is_r2_or_later(release: str) -> bool:
20
+ """Check if release is r2 or later (r2, r3, r2.5, etc.)."""
21
+ if not release or not release.startswith("r"):
22
+ return False
23
+ try:
24
+ version = float(release[1:])
25
+ return version >= 2
26
+ except ValueError:
27
+ return False
28
+
29
+
30
@dataclass
class DocumentData:
    """A file read from disk together with the metadata derived from it.

    Instances are normally built with :meth:`from_file`, which reads the
    file, derives tags/domain/release from its path, merges in frontmatter
    tags for Markdown files, and computes a change-detection hash over the
    content and the derived metadata.
    """

    ims_doc_id: str  # UUIDv5 derived from the workspace-relative path
    file_path: Path
    content: bytes  # raw file bytes
    is_text: bool  # True when the suffix is in the text-extension list
    content_str: str | None  # UTF-8 decoded content; None for non-text files
    tags: list[str]
    domain: str
    release: str  # release tag such as "r2"; "" when the path contains none
    doc_title: str
    original_path: str
    sort_order: int | None
    content_hash: str
    line_count: int | None = None
    resource_path: str | None = None
    frontmatter: JsonDict | None = None

    # All extensions that contain human-readable text (code, config, markup).
    # Used ONLY for is_text detection (read as UTF-8 string), NOT as upload filter.
    # Left unannotated on purpose so the dataclass machinery ignores it.
    _DEFAULT_TEXT_EXTENSIONS = (
        # Markdown / text
        ".md", ".mdx", ".txt", ".rst",
        # Web / markup
        ".htm", ".html", ".xml", ".yml", ".yaml", ".json", ".jsonl", ".ldjson",
        ".csv", ".ini", ".rtf",
        # Text data formats
        ".toml", ".cfg", ".conf", ".properties", ".env", ".log",
        # Shell / scripts
        ".sh", ".bash", ".zsh", ".fish", ".ps1", ".psm1", ".bat", ".cmd",
        # Code (all languages RAGFlow accepts + common extras)
        ".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".php", ".go",
        ".cs", ".kt", ".sql", ".rb", ".rs", ".swift", ".r",
        ".jsx", ".tsx", ".vue", ".scss", ".css", ".less", ".sass",
        ".lua", ".pl", ".pm", ".groovy", ".gradle", ".scala",
        ".tf", ".hcl", ".dockerfile",
    )

    @classmethod
    def from_file(
        cls,
        file_path: Path,
        workspace_root: Path,
        file_extensions_text: List[str] | None = None,
        publish_root: Path | None = None,
    ) -> "DocumentData":
        """Read ``file_path`` and build a fully populated ``DocumentData``.

        Args:
            file_path: File to load.
            workspace_root: Root used to derive the document id and, for
                files outside any ``instructions`` folder, the path metadata.
            file_extensions_text: Suffixes (lower-case, with dot) treated as
                text. Defaults to the built-in extension list.
            publish_root: Accepted for interface compatibility and ignored.

        Returns:
            The populated instance.

        Raises:
            OSError: Propagated from reading the file.
        """
        del publish_root  # Publish scope is physical; metadata paths are normalized from file path.
        text_extensions = (
            cls._DEFAULT_TEXT_EXTENSIONS
            if file_extensions_text is None
            else file_extensions_text
        )

        ims_doc_id = cls._generate_doc_id(file_path, workspace_root)
        content = file_path.read_bytes()
        is_text = file_path.suffix.lower() in text_extensions

        content_str = None
        if is_text:
            try:
                content_str = content.decode("utf-8")
            except UnicodeDecodeError:
                # Best effort: drop undecodable bytes rather than fail the file.
                content_str = content.decode("utf-8", errors="ignore")

        # Count lines platform-independently: \r\n, \n\r, \r, \n all count as separators
        line_count = None
        if content_str is not None:
            import re as _re
            line_count = len(_re.split(r'\r\n|\n\r|\r|\n', content_str))

        instr_rel = cls._path_relative_to_instructions(file_path)
        parsed_path = cls._parse_instructions_path(instr_rel)
        path_tags, domain, release = cls._extract_path_metadata(
            file_path=file_path,
            workspace_root=workspace_root,
            instr_rel=instr_rel,
            parsed_path=parsed_path,
        )
        frontmatter_tags, sort_order, fm_dict = cls._extract_frontmatter_metadata(file_path, content_str)
        tags = cls._merge_tags(path_tags, frontmatter_tags)

        # instructions-relative path is the single source of truth for path-like metadata fields
        original_path = instr_rel or file_path.name
        resource_path = cls._compute_resource_path(parsed_path)
        doc_title = cls._compute_doc_title(parsed_path, file_path.name)

        # NOTE(review): for non-text files only the byte length feeds the hash,
        # so a same-size binary edit leaves content_hash unchanged. Not changed
        # here because previously stored hashes would stop matching; confirm
        # whether this is intended.
        hash_content = content_str if content_str is not None else str(len(content))
        # NOTE(review): doc_title is passed as both `title` and `doc_name`;
        # confirm this is intended.
        content_hash = cls._calculate_hash(
            hash_content,
            tags,
            domain,
            release,
            doc_title,
            doc_title,
            sort_order,
            original_path,
            resource_path,
        )

        return cls(
            ims_doc_id=ims_doc_id,
            file_path=file_path,
            content=content,
            is_text=is_text,
            content_str=content_str,
            tags=tags,
            domain=domain,
            release=release,
            doc_title=doc_title,
            original_path=original_path,
            sort_order=sort_order,
            content_hash=content_hash,
            line_count=line_count,
            resource_path=resource_path,
            frontmatter=fm_dict,
        )

    @staticmethod
    def _generate_doc_id(file_path: Path, workspace_root: Path) -> str:
        """Return a deterministic UUIDv5 for the workspace-relative path.

        Falls back to ``file_path`` as given when it is not located under
        ``workspace_root``. Backslashes are normalised to forward slashes
        before hashing.
        """
        try:
            rel_path = file_path.relative_to(workspace_root)
        except ValueError:
            rel_path = file_path

        path_str = str(rel_path).replace("\\", "/")
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"rulesofpower.{path_str}"))

    @staticmethod
    def _path_relative_to_instructions(file_path: Path) -> Optional[str]:
        """Get path relative to the first 'instructions' folder in the absolute path.

        Uses the topmost (first) 'instructions' folder if multiple exist.
        Returns None when there is no such folder, or when nothing follows it.

        Examples:
            /ws/instructions/r2/core/agents/planner.md -> r2/core/agents/planner.md
            /ws/instructions/agents/r1/coding.md -> agents/r1/coding.md
            /ws/other/file.md -> None
        """
        parts = file_path.resolve().parts
        for i, part in enumerate(parts):
            if part == "instructions":
                remaining = parts[i + 1:]
                # Only the first match counts, so stop here either way.
                return "/".join(remaining) if remaining else None
        return None

    @dataclass
    class ParsedInstructionsPath:
        """Decomposed instructions-relative path.

        For R2 path r2/core/agents/coding.md:
            release="r2", org="core",
            rest=("agents", "coding.md"), filename="coding.md"

        For R1 path agents/r1/coding.md:
            release="r1", org=None,
            rest=("coding.md",), filename="coding.md"
        """
        release: str  # release folder (r1, r2, ...)
        org: Optional[str]  # org folder (R2+ only, e.g. "core")
        rest: tuple[str, ...]  # remaining path parts after org (R2+) or release (R1)
        filename: str  # bare filename

    @classmethod
    def _parse_instructions_path(cls, instr_rel: Optional[str]) -> Optional["DocumentData.ParsedInstructionsPath"]:
        """Decompose instructions-relative path into semantic parts.

        Returns None when ``instr_rel`` is empty or contains no release tag.
        The first path part that looks like a release tag is used.
        """
        if not instr_rel:
            return None

        parts = tuple(instr_rel.split("/"))
        # Find the first release-like part and its position in a single pass.
        release_idx = next(
            (idx for idx, part in enumerate(parts) if cls._is_release_tag(part)),
            None,
        )
        if release_idx is None:
            return None

        release = parts[release_idx]
        after_release = parts[release_idx + 1:]
        filename = parts[-1]

        if _is_r2_or_later(release) and len(after_release) >= 2:
            # R2+ layout: <release>/<org>/<rest...>
            org = after_release[0]
            rest = after_release[1:]
        else:
            org = None
            # When nothing follows the release, the release itself is the last
            # part, so fall back to the filename alone.
            rest = after_release if after_release else (filename,)

        return cls.ParsedInstructionsPath(
            release=release,
            org=org,
            rest=rest,
            filename=filename,
        )

    @staticmethod
    def _compute_doc_title(parsed_path: Optional["DocumentData.ParsedInstructionsPath"], fallback_filename: str) -> str:
        """Compute document title.

        R2+: instructions-relative path minus release version.
            e.g. r2/core/agents/coding.md -> core/agents/coding.md
        R1: bare filename (e.g. coding.md)
        No instructions folder: bare filename
        """
        if not parsed_path or not _is_r2_or_later(parsed_path.release):
            return fallback_filename

        if parsed_path.org:
            return "/".join((parsed_path.org, *parsed_path.rest))
        return fallback_filename

    @staticmethod
    def _compute_resource_path(parsed_path: Optional["DocumentData.ParsedInstructionsPath"]) -> Optional[str]:
        """Compute resource_path: logical path stripped of release (and org for R2+).

        R2+: strip release and org → e.g. r2/core/skills/planning/SKILL.md -> skills/planning/SKILL.md
        R1: strip everything up to and including release → e.g. agents/r1/coding.md -> coding.md

        The release/org stripping has already been done when ``rest`` was
        built, so both layouts are handled by the same expression. Returns
        None when there is no parsed path.
        """
        if not parsed_path:
            return None
        if parsed_path.rest:
            return "/".join(parsed_path.rest)
        return parsed_path.filename

    @staticmethod
    def _is_release_tag(tag: str) -> bool:
        """Return True for tags shaped like ``r<digits>`` with optional dots (r1, r2.5)."""
        return bool(tag) and tag.startswith("r") and tag[1:].replace(".", "").isdigit()

    @classmethod
    def _extract_path_metadata(
        cls,
        file_path: Path,
        workspace_root: Path,
        instr_rel: Optional[str] = None,
        parsed_path: Optional["DocumentData.ParsedInstructionsPath"] = None,
    ) -> tuple[List[str], str, str]:
        """Derive ``(tags, domain, release)`` from the file's location.

        The instructions-relative path is preferred when available; otherwise
        the workspace-relative path is used. A file outside the workspace
        yields ``([], "general", "")``.
        """
        if instr_rel:
            normalized_parts = ("instructions", *instr_rel.split("/"))
            return cls._extract_path_metadata_from_parts(normalized_parts, parsed_path)

        try:
            rel_path = file_path.relative_to(workspace_root)
        except ValueError:
            return ([], "general", "")

        return cls._extract_path_metadata_from_parts(rel_path.parts, parsed_path)

    @classmethod
    def _extract_path_metadata_from_parts(
        cls,
        path_parts: tuple[str, ...],
        parsed_path: Optional["DocumentData.ParsedInstructionsPath"] = None
    ) -> tuple[List[str], str, str]:
        """Build ``(tags, domain, release)`` from already-split path parts.

        Tags are every directory part, the filename, and the combined
        ``parent/filename`` and ``grandparent/parent/filename`` tags. The
        domain is the first directory, or for R2+ releases the directory that
        follows the release folder; it is "general" when there is no
        directory part.
        """
        parts = path_parts[:-1]
        filename = path_parts[-1] if path_parts else ""
        domain = parts[0] if parts else "general"

        tags = list(parts)
        if filename:
            tags.append(filename)

        def add_unique(tag: str) -> None:
            # Combined tags must not duplicate an existing entry.
            if tag not in tags:
                tags.append(tag)

        # First release-like entry wins; "" when none is present.
        release = next((tag for tag in tags if cls._is_release_tag(tag)), "")

        # R2+ domain: folder after release in path
        if _is_r2_or_later(release) and release in parts:
            release_idx = list(parts).index(release)
            if release_idx + 1 < len(parts):
                domain = parts[release_idx + 1]

        # Two-part and three-part tags: based on resource_path (parsed_path.rest)
        # Use parsed_path.rest if available, otherwise fall back to full path parts
        resource_parts = parsed_path.rest if parsed_path else None

        if resource_parts:
            # Two-part tag: <parent>/<filename> from resource_path
            if len(resource_parts) >= 2:
                add_unique(f"{resource_parts[-2]}/{resource_parts[-1]}")
            # Three-part tag: <grandparent>/<parent>/<filename> from resource_path
            if len(resource_parts) >= 3:
                add_unique(f"{resource_parts[-3]}/{resource_parts[-2]}/{resource_parts[-1]}")
        elif filename:
            # Fallback: use full path parts (for non-instructions files)
            if len(parts) >= 1:
                add_unique(f"{parts[-1]}/{filename}")
            if len(parts) >= 2:
                add_unique(f"{parts[-2]}/{parts[-1]}/{filename}")

        return (tags, domain, release)

    @staticmethod
    def _extract_frontmatter_metadata(file_path: Path, content_str: str | None) -> tuple[list[str], int | None, JsonDict | None]:
        """Extract ``(tags, sort_order, frontmatter dict)`` from Markdown frontmatter.

        Returns ``([], None, None)`` when the file is not ``.md``/``.markdown``,
        has no text content, the ``frontmatter`` module is unavailable, or
        parsing fails. ``tags`` may be a comma-separated string or a list;
        ``sort_order`` is coerced to int and dropped when that fails.
        """
        if not content_str or file_path.suffix.lower() not in {".md", ".markdown"}:
            return [], None, None
        if frontmatter is None:
            return [], None, None

        try:
            post = frontmatter.loads(content_str)
        except Exception:
            # Deliberate best effort: malformed frontmatter must not block the document.
            return [], None, None

        fm_dict = dict(post.metadata) if post.metadata else None

        tags_value = post.metadata.get("tags", [])
        if isinstance(tags_value, str):
            fm_tags = [item.strip() for item in tags_value.split(",") if item.strip()]
        elif isinstance(tags_value, list):
            fm_tags = [str(item).strip() for item in tags_value if str(item).strip()]
        else:
            fm_tags = []

        sort_order: Optional[int] = None
        sort_order_raw = post.metadata.get("sort_order")
        if sort_order_raw is not None:
            try:
                sort_order = int(sort_order_raw)
            except (TypeError, ValueError):
                sort_order = None

        return fm_tags, sort_order, fm_dict

    @staticmethod
    def _merge_tags(path_tags: list[str], frontmatter_tags: list[str]) -> list[str]:
        """Concatenate both tag lists, dropping case-insensitive duplicates.

        The first spelling encountered is kept and order is preserved.
        """
        merged: list[str] = []
        seen: set[str] = set()
        for tag in [*path_tags, *frontmatter_tags]:
            normalized = tag.lower()
            if normalized in seen:
                continue
            seen.add(normalized)
            merged.append(tag)
        return merged

    @staticmethod
    def _calculate_hash(
        content: str,
        tags: list[str],
        domain: str,
        release: str,
        title: str,
        doc_name: str,
        sort_order: int | None,
        original_path: str = "",
        resource_path: str | None = None,
    ) -> str:
        """Return an MD5 hex digest over the content and all metadata fields.

        Tags are sorted case-insensitively first so that tag order does not
        affect the result. ``None`` values are hashed as empty strings.
        """
        sorted_tags = ",".join(sorted(tags, key=str.lower))
        hash_input = (
            f"{content}|tags:{sorted_tags}|domain:{domain}|release:{release}|title:{title}"
            f"|doc_name:{doc_name}"
            f"|sort_order:{sort_order if sort_order is not None else ''}"
            f"|original_path:{original_path}"
            f"|resource_path:{resource_path if resource_path is not None else ''}"
        )
        # The digest is a change fingerprint, not a security control; saying
        # so keeps MD5 usable on FIPS-restricted Python builds. The resulting
        # digest is unchanged.
        return hashlib.md5(hash_input.encode("utf-8"), usedforsecurity=False).hexdigest()

    def to_metadata_dict(self) -> JsonDict:
        """Return the metadata as a dict.

        ``line_count``, ``resource_path`` and ``frontmatter`` are included
        only when they are not None.
        """
        meta: JsonDict = {
            "tags": self.tags,
            "domain": self.domain,
            "release": self.release,
            "content_hash": self.content_hash,
            "ims_doc_id": self.ims_doc_id,
            "original_path": self.original_path,
            "doc_title": self.doc_title,
            "sort_order": self.sort_order,
        }
        if self.line_count is not None:
            meta["line_count"] = self.line_count
        if self.resource_path is not None:
            meta["resource_path"] = self.resource_path
        if self.frontmatter is not None:
            meta["frontmatter"] = cast(JsonValue, self.frontmatter)
        return meta