rosetta-cli 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rosetta_cli/__init__.py +12 -0
- rosetta_cli/__main__.py +6 -0
- rosetta_cli/cli.py +379 -0
- rosetta_cli/commands/__init__.py +5 -0
- rosetta_cli/commands/base_command.py +82 -0
- rosetta_cli/commands/cleanup_command.py +214 -0
- rosetta_cli/commands/list_command.py +70 -0
- rosetta_cli/commands/parse_command.py +205 -0
- rosetta_cli/commands/publish_command.py +113 -0
- rosetta_cli/commands/verify_command.py +46 -0
- rosetta_cli/ims_auth.py +124 -0
- rosetta_cli/ims_config.py +317 -0
- rosetta_cli/ims_publisher.py +859 -0
- rosetta_cli/ims_utils.py +28 -0
- rosetta_cli/ragflow_client.py +928 -0
- rosetta_cli/services/__init__.py +8 -0
- rosetta_cli/services/auth_service.py +114 -0
- rosetta_cli/services/dataset_service.py +72 -0
- rosetta_cli/services/document_data.py +408 -0
- rosetta_cli/services/document_service.py +357 -0
- rosetta_cli/typing_utils.py +49 -0
- rosetta_cli-2.0.0.dist-info/METADATA +639 -0
- rosetta_cli-2.0.0.dist-info/RECORD +26 -0
- rosetta_cli-2.0.0.dist-info/WHEEL +5 -0
- rosetta_cli-2.0.0.dist-info/entry_points.txt +2 -0
- rosetta_cli-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""IMS Services Package - Service layer for common operations"""
|
|
2
|
+
|
|
3
|
+
from .document_data import DocumentData
|
|
4
|
+
from .document_service import DocumentService
|
|
5
|
+
from .dataset_service import DatasetService
|
|
6
|
+
from .auth_service import AuthService
|
|
7
|
+
|
|
8
|
+
__all__ = ['DocumentData', 'DocumentService', 'DatasetService', 'AuthService']
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Authentication Service
|
|
3
|
+
|
|
4
|
+
Handles API key verification and authentication-related operations.
|
|
5
|
+
Eliminates code duplication across commands.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any, Dict, Optional
|
|
10
|
+
|
|
11
|
+
from ..ims_auth import IMSAuthManager
|
|
12
|
+
from ..ims_config import IMSConfig
|
|
13
|
+
from ..ragflow_client import RAGFlowClient
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class AuthService:
    """Service for handling authentication operations.

    Wraps an ``IMSAuthManager`` to verify the configured API key and the
    server connection, printing human-readable results for the CLI.
    """

    def __init__(self, client: RAGFlowClient, config: IMSConfig):
        """
        Initialize AuthService.

        Args:
            client: RAGFlow client instance
            config: IMS configuration
        """
        self.client = client
        self.config = config
        self.auth_manager = IMSAuthManager(client, config)

    def verify_api_key(self) -> tuple[bool, float]:
        """
        Verify API key with timing and display results.

        Returns:
            Tuple of (success: bool, duration: float)
        """
        start_time = time.time()

        # Fail fast when no key is configured; still report elapsed time.
        if not self.config.api_key:
            print("✗ API key not configured")
            return False, time.time() - start_time

        print(f"→ Verifying API key for {self.config.base_url}")

        success, error_msg = self.auth_manager.verify_api_key()
        duration = time.time() - start_time

        if success:
            # Plain strings here: no placeholders, so no f-prefix (ruff F541).
            print("✓ API key is valid")
            print(f"  Authentication: {duration:.2f}s")
        else:
            print(f"\n✗ {error_msg}")

        return success, duration

    def verify_connection(self) -> bool:
        """
        Verify full connection including server health and display results.

        Returns:
            True if connection successful, False otherwise
        """
        success, error_msg = self.auth_manager.verify_connection()

        if success:
            print(f"✓ Connected to RAGFlow at {self.config.base_url}")
            server_info = self.auth_manager.get_server_info()
            if server_info:
                self._display_server_info(server_info)
        else:
            print(f"\n✗ {error_msg}")

        return success

    @staticmethod
    def verify_or_exit(client: RAGFlowClient, config: IMSConfig) -> None:
        """
        Verify API key and exit if verification fails.

        Args:
            client: RAGFlow client instance
            config: IMS configuration

        Raises:
            SystemExit: If verification fails
        """
        # Local import keeps the dependency next to its only use; hoisted out
        # of the failure branch so it is unconditional within this helper.
        import sys

        auth_service = AuthService(client, config)
        success, _ = auth_service.verify_api_key()

        if not success:
            print("\n✗ Authentication failed. Please check your API key.")
            sys.exit(1)

    def _display_server_info(self, server_info: Dict[str, Any]) -> None:
        """Display server information and, when reported, dependency health.

        Args:
            server_info: Mapping with at least 'environment',
                'dataset_default' and 'dataset_template'; optionally
                'health_status' ('ok' | 'nok' | 'unavailable') and, for
                'nok', a 'services' mapping of service name -> status.
        """
        print("\nServer Information:")
        print(f"  Environment: {server_info['environment']}")
        print(f"  Dataset default: {server_info['dataset_default']}")
        print(f"  Dataset template: {server_info['dataset_template']}")

        if 'health_status' in server_info:
            health_status = server_info.get('health_status', 'unknown')
            if health_status == 'ok':
                print("\n✓ System Health: All dependencies healthy")
            elif health_status == 'nok':
                print("\n⚠️ System Health: Some dependencies unhealthy")
                services = server_info.get('services', {})
                for service, status in services.items():
                    symbol = "✓" if status == "ok" else "✗"
                    print(f"  {symbol} {service}: {status}")
            elif health_status == 'unavailable':
                print("\n⏸️ System Health: Check unavailable")
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Dataset Service
|
|
3
|
+
|
|
4
|
+
Handles dataset resolution, auto-detection, and management operations.
|
|
5
|
+
Eliminates code duplication across commands.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
from ..ims_config import IMSConfig
|
|
11
|
+
from ..ragflow_client import RAGFlowClient
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DatasetService:
    """Service for handling dataset operations.

    Resolves the target dataset name (explicit, auto-detected from the
    configured template prefix, or the configured default) and lists
    available datasets for the CLI.
    """

    def __init__(self, client: RAGFlowClient, config: IMSConfig):
        """
        Initialize DatasetService.

        Args:
            client: RAGFlow client instance
            config: IMS configuration
        """
        self.client = client
        self.config = config

    def resolve_dataset_name(self, args_dataset: Optional[str]) -> tuple[Optional[str], bool]:
        """
        Resolve dataset name with auto-detection.

        Args:
            args_dataset: Dataset name from arguments (can be None)

        Returns:
            Tuple of (dataset_name: str or None, auto_detected: bool)
            Returns (None, False) if resolution fails
        """
        # Explicit dataset provided: use it verbatim, no auto-detection.
        if args_dataset:
            return args_dataset, False

        # Try to find one matching the template pattern; everything before
        # the first '{' placeholder is the literal prefix to match on.
        template_prefix = self.config.dataset_template.split('{')[0]

        # List all datasets since RAGFlow's name filter is exact match, not substring.
        # For prefix matching, we need client-side filtering.
        all_datasets = self.client.list_datasets(page_size=self.config.page_size)

        # Filter for exact prefix match
        matching = [ds for ds in all_datasets if ds.name.startswith(template_prefix)]

        if len(matching) == 1:
            print(f"Auto-detected dataset: {matching[0].name} (from template pattern '{template_prefix}*')")
            return matching[0].name, True
        elif len(matching) > 1:
            # Ambiguous: refuse to guess, ask the caller to disambiguate.
            print(f"Multiple datasets match pattern '{template_prefix}*':")
            for ds in matching:
                print(f"  - {ds.name}")
            # No placeholders: plain string, not f-string (ruff F541).
            print("\nPlease specify which dataset using --dataset flag")
            return None, False
        else:
            # No match at all: fall back to the configured default.
            print(f"Using default dataset: {self.config.dataset_default}")
            return self.config.dataset_default, True

    def display_available_datasets(self) -> None:
        """Display list of available datasets"""
        datasets = self.client.list_datasets(page_size=self.config.page_size)
        print("\nAvailable datasets:")
        for ds in datasets:
            print(f"  - {ds.name}")
|
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
"""Document data model with frontmatter-aware metadata extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import hashlib
|
|
8
|
+
import uuid
|
|
9
|
+
from typing import List, Optional, cast
|
|
10
|
+
|
|
11
|
+
from ..typing_utils import JsonDict, JsonValue
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
import frontmatter
|
|
15
|
+
except Exception: # pragma: no cover - guarded at runtime
|
|
16
|
+
frontmatter = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _is_r2_or_later(release: str) -> bool:
|
|
20
|
+
"""Check if release is r2 or later (r2, r3, r2.5, etc.)."""
|
|
21
|
+
if not release or not release.startswith("r"):
|
|
22
|
+
return False
|
|
23
|
+
try:
|
|
24
|
+
version = float(release[1:])
|
|
25
|
+
return version >= 2
|
|
26
|
+
except ValueError:
|
|
27
|
+
return False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class DocumentData:
    """In-memory representation of a publishable document plus derived metadata.

    Built via :meth:`from_file`. Metadata (tags, domain, release, title,
    resource path, content hash) is derived from the file's location relative
    to the first ``instructions`` folder in its path and, for Markdown files,
    from YAML frontmatter. The ``content_hash`` fingerprints both content and
    metadata so publishers can detect changes.
    """

    ims_doc_id: str                      # deterministic UUID5 from workspace-relative path
    file_path: Path                      # path the content was read from
    content: bytes                       # raw file bytes
    is_text: bool                        # suffix is in the known text-extension list
    content_str: str | None              # UTF-8 decoded content (text files only)
    tags: list[str]                      # path-derived + frontmatter tags, case-insensitively deduped
    domain: str                          # first path component, or folder after release for r2+
    release: str                         # release tag found in path ("r1", "r2", ...; "" if none)
    doc_title: str                       # r2+: org-rooted relative path; otherwise bare filename
    original_path: str                   # instructions-relative path, or bare filename
    sort_order: int | None               # frontmatter "sort_order", if parseable as int
    content_hash: str                    # md5 over content + metadata (see _calculate_hash)
    line_count: int | None = None        # line count of content_str (text files only)
    resource_path: str | None = None     # logical path with release (and org for r2+) stripped
    frontmatter: JsonDict | None = None  # full parsed frontmatter mapping, if any

    @classmethod
    def from_file(
        cls,
        file_path: Path,
        workspace_root: Path,
        file_extensions_text: List[str] | None = None,
        publish_root: Path | None = None,
    ) -> "DocumentData":
        """Read *file_path* and derive all metadata fields.

        Args:
            file_path: File to read.
            workspace_root: Root used for the stable doc-id path.
            file_extensions_text: Suffixes treated as text; defaults to the
                built-in list below.
            publish_root: Accepted for interface compatibility but unused.
        """
        del publish_root  # Publish scope is physical; metadata paths are normalized from file path.
        if file_extensions_text is None:
            # All extensions that contain human-readable text (code, config, markup).
            # Used ONLY for is_text detection (read as UTF-8 string), NOT as upload filter.
            file_extensions_text = [
                # Markdown / text
                ".md", ".mdx", ".txt", ".rst",
                # Web / markup
                ".htm", ".html", ".xml", ".yml", ".yaml", ".json", ".jsonl", ".ldjson",
                ".csv", ".ini", ".rtf",
                # Text data formats
                ".toml", ".cfg", ".conf", ".properties", ".env", ".log",
                # Shell / scripts
                ".sh", ".bash", ".zsh", ".fish", ".ps1", ".psm1", ".bat", ".cmd",
                # Code (all languages RAGFlow accepts + common extras)
                ".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".php", ".go",
                ".cs", ".kt", ".sql", ".rb", ".rs", ".swift", ".r",
                ".jsx", ".tsx", ".vue", ".scss", ".css", ".less", ".sass",
                ".lua", ".pl", ".pm", ".groovy", ".gradle", ".scala",
                ".tf", ".hcl", ".dockerfile",
            ]

        ims_doc_id = cls._generate_doc_id(file_path, workspace_root)
        content = file_path.read_bytes()
        is_text = file_path.suffix.lower() in file_extensions_text

        content_str = None
        if is_text:
            try:
                content_str = content.decode("utf-8")
            except UnicodeDecodeError:
                # Best-effort: keep whatever decodes rather than failing the file.
                content_str = content.decode("utf-8", errors="ignore")

        # Count lines platform-independently: \r\n, \n\r, \r, \n all count as separators
        line_count = None
        if content_str is not None:
            import re as _re
            line_count = len(_re.split(r'\r\n|\n\r|\r|\n', content_str))

        instr_rel = cls._path_relative_to_instructions(file_path)
        parsed_path = cls._parse_instructions_path(instr_rel)
        path_tags, domain, release = cls._extract_path_metadata(
            file_path=file_path,
            workspace_root=workspace_root,
            instr_rel=instr_rel,
            parsed_path=parsed_path,
        )
        frontmatter_tags, sort_order, fm_dict = cls._extract_frontmatter_metadata(file_path, content_str)
        tags = cls._merge_tags(path_tags, frontmatter_tags)

        # instructions-relative path is the single source of truth for path-like metadata fields
        original_path = instr_rel if instr_rel else file_path.name
        resource_path = cls._compute_resource_path(parsed_path)
        doc_title = cls._compute_doc_title(parsed_path, file_path.name)

        content_hash = cls._calculate_hash(
            # Binary files hash their length stand-in, not raw bytes.
            content_str if content_str is not None else str(len(content)),
            tags,
            domain,
            release,
            doc_title,
            # NOTE(review): doc_title is passed for both `title` and `doc_name`
            # hash inputs — looks deliberate, but confirm before changing; any
            # change here alters content_hash for every document.
            doc_title,
            sort_order,
            original_path,
            resource_path,
        )

        return cls(
            ims_doc_id=ims_doc_id,
            file_path=file_path,
            content=content,
            is_text=is_text,
            content_str=content_str,
            tags=tags,
            domain=domain,
            release=release,
            doc_title=doc_title,
            original_path=original_path,
            sort_order=sort_order,
            content_hash=content_hash,
            line_count=line_count,
            resource_path=resource_path,
            frontmatter=fm_dict,
        )

    @staticmethod
    def _generate_doc_id(file_path: Path, workspace_root: Path) -> str:
        """Return a deterministic UUID5 for the workspace-relative path.

        Falls back to the full path when *file_path* is outside
        *workspace_root*. Separators are normalized so Windows and POSIX
        paths produce the same id.
        """
        try:
            rel_path = file_path.relative_to(workspace_root)
        except ValueError:
            rel_path = file_path

        path_str = str(rel_path).replace("\\", "/")
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"rulesofpower.{path_str}"))

    @staticmethod
    def _path_relative_to_instructions(file_path: Path) -> Optional[str]:
        """Get path relative to the first 'instructions' folder in the absolute path.

        Uses the topmost (first) 'instructions' folder if multiple exist.

        Examples:
            /ws/instructions/r2/core/agents/planner.md -> r2/core/agents/planner.md
            /ws/instructions/agents/r1/coding.md -> agents/r1/coding.md
            /ws/other/file.md -> None
        """
        parts = file_path.resolve().parts
        for i, part in enumerate(parts):
            if part == "instructions":
                remaining = parts[i + 1:]
                if remaining:
                    return "/".join(remaining)
                # "instructions" was the last component: nothing relative to it.
                return None
        return None

    @dataclass
    class ParsedInstructionsPath:
        """Decomposed instructions-relative path.

        For R2 path r2/core/agents/coding.md:
            release="r2", org="core",
            rest=("agents", "coding.md"), filename="coding.md"

        For R1 path agents/r1/coding.md:
            release="r1", org=None,
            rest=("coding.md",), filename="coding.md"
        """
        release: str  # release folder (r1, r2, ...)
        org: Optional[str]  # org folder (R2+ only, e.g. "core")
        rest: tuple[str, ...]  # remaining path parts after org (R2+) or release (R1)
        filename: str  # bare filename

    @classmethod
    def _parse_instructions_path(cls, instr_rel: Optional[str]) -> Optional["DocumentData.ParsedInstructionsPath"]:
        """Decompose instructions-relative path into semantic parts.

        Returns None when *instr_rel* is empty or contains no release tag.
        """
        if not instr_rel:
            return None

        parts = tuple(instr_rel.split("/"))
        # First component anywhere in the path that looks like rN / rN.M.
        release = next((part for part in parts if cls._is_release_tag(part)), "")
        if not release:
            return None

        release_idx = list(parts).index(release)
        after_release = parts[release_idx + 1:]
        filename = parts[-1]

        # R2+ layout expects an org folder directly after the release folder.
        if _is_r2_or_later(release) and len(after_release) >= 2:
            org = after_release[0]
            rest = after_release[1:]
        else:
            org = None
            rest = after_release if after_release else (filename,)

        return cls.ParsedInstructionsPath(
            release=release,
            org=org,
            rest=rest,
            filename=filename,
        )

    @staticmethod
    def _compute_doc_title(parsed_path: Optional["DocumentData.ParsedInstructionsPath"], fallback_filename: str) -> str:
        """Compute document title.

        R2+: instructions-relative path minus release version.
             e.g. r2/core/agents/coding.md -> core/agents/coding.md
        R1: bare filename (e.g. coding.md)
        No instructions folder: bare filename
        """
        if not parsed_path or not _is_r2_or_later(parsed_path.release):
            return fallback_filename

        if parsed_path.org:
            return "/".join((parsed_path.org, *parsed_path.rest))
        # R2+ without an org folder degrades to the bare filename.
        return fallback_filename

    @staticmethod
    def _compute_resource_path(parsed_path: Optional["DocumentData.ParsedInstructionsPath"]) -> Optional[str]:
        """Compute resource_path: logical path stripped of release (and org for R2+).

        R2+: strip release and org → e.g. r2/core/skills/planning/SKILL.md -> skills/planning/SKILL.md
        R1: strip everything up to and including release → e.g. agents/r1/coding.md -> coding.md
        """
        if not parsed_path:
            return None

        # NOTE(review): both branches below are currently identical; the split
        # appears to exist only to document the R1 vs R2+ intent. Confirm
        # before collapsing — parsed_path.rest already differs per release.
        if _is_r2_or_later(parsed_path.release):
            return "/".join(parsed_path.rest) if parsed_path.rest else parsed_path.filename
        return "/".join(parsed_path.rest) if parsed_path.rest else parsed_path.filename

    @staticmethod
    def _is_release_tag(tag: str) -> bool:
        # A release tag is "r" followed by digits with optional dots (r1, r2.5).
        return bool(tag) and tag.startswith("r") and tag[1:].replace(".", "").isdigit()

    @classmethod
    def _extract_path_metadata(
        cls,
        file_path: Path,
        workspace_root: Path,
        instr_rel: Optional[str] = None,
        parsed_path: Optional["DocumentData.ParsedInstructionsPath"] = None,
    ) -> tuple[List[str], str, str]:
        """Derive (tags, domain, release) from the file's path.

        Prefers the instructions-relative path; otherwise falls back to the
        workspace-relative path. Files outside both yield ([], "general", "").
        """
        if instr_rel:
            # Re-prefix with "instructions" so the tag layout matches the
            # workspace-relative fallback below.
            normalized_parts = ("instructions", *instr_rel.split("/"))
            return cls._extract_path_metadata_from_parts(normalized_parts, parsed_path)

        try:
            rel_path = file_path.relative_to(workspace_root)
        except ValueError:
            return ([], "general", "")

        return cls._extract_path_metadata_from_parts(rel_path.parts, parsed_path)

    @classmethod
    def _extract_path_metadata_from_parts(
        cls,
        path_parts: tuple[str, ...],
        parsed_path: Optional["DocumentData.ParsedInstructionsPath"] = None
    ) -> tuple[List[str], str, str]:
        """Build tag list, domain and release from raw path components.

        Tags are every directory component plus the filename, plus two- and
        three-part suffix tags for disambiguation.
        """
        parts = path_parts[:-1]
        domain = parts[0] if parts else "general"
        tags = list(parts) if parts else []
        filename = path_parts[-1] if path_parts else ""
        if filename:
            tags.append(filename)

        # First release-looking component wins.
        release = ""
        for tag in tags:
            if cls._is_release_tag(tag):
                release = tag
                break

        # R2+ domain: folder after release in path
        if _is_r2_or_later(release) and release in parts:
            release_idx = list(parts).index(release)
            if release_idx + 1 < len(parts):
                domain = parts[release_idx + 1]

        # Two-part and three-part tags: based on resource_path (parsed_path.rest)
        # Use parsed_path.rest if available, otherwise fall back to full path parts
        resource_parts = parsed_path.rest if parsed_path else None

        if resource_parts:
            # Two-part tag: <parent>/<filename> from resource_path
            if len(resource_parts) >= 2:
                two_part = f"{resource_parts[-2]}/{resource_parts[-1]}"
                if two_part not in tags:
                    tags.append(two_part)

            # Three-part tag: <grandparent>/<parent>/<filename> from resource_path
            if len(resource_parts) >= 3:
                three_part = f"{resource_parts[-3]}/{resource_parts[-2]}/{resource_parts[-1]}"
                if three_part not in tags:
                    tags.append(three_part)
        else:
            # Fallback: use full path parts (for non-instructions files)
            # NOTE(review): the literal "(unknown)" below looks like a
            # placeholder where {filename} was expected (the comments above
            # say <parent>/<filename>). Confirm against the published tags
            # before changing — altering it changes content_hash.
            if len(parts) >= 1 and filename:
                two_part = f"{parts[-1]}/(unknown)"
                if two_part not in tags:
                    tags.append(two_part)

            if len(parts) >= 2 and filename:
                three_part = f"{parts[-2]}/{parts[-1]}/(unknown)"
                if three_part not in tags:
                    tags.append(three_part)

        return (tags, domain, release)

    @staticmethod
    def _extract_frontmatter_metadata(file_path: Path, content_str: str | None) -> tuple[list[str], int | None, JsonDict | None]:
        """Parse YAML frontmatter from Markdown content.

        Returns (tags, sort_order, full_frontmatter_dict); all empty/None when
        the file is not Markdown, the frontmatter library is unavailable, or
        parsing fails (best-effort by design).
        """
        if not content_str or file_path.suffix.lower() not in {".md", ".markdown"}:
            return [], None, None
        if frontmatter is None:
            # Library failed to import at module load; treat as "no frontmatter".
            return [], None, None

        try:
            post = frontmatter.loads(content_str)
        except Exception:
            # Malformed frontmatter must not block publishing the file.
            return [], None, None

        fm_dict = dict(post.metadata) if post.metadata else None

        # "tags" may be a comma-separated string or a YAML list.
        tags_value = post.metadata.get("tags", [])
        if isinstance(tags_value, str):
            fm_tags = [item.strip() for item in tags_value.split(",") if item.strip()]
        elif isinstance(tags_value, list):
            fm_tags = [str(item).strip() for item in tags_value if str(item).strip()]
        else:
            fm_tags = []

        sort_order_raw = post.metadata.get("sort_order")
        sort_order: Optional[int] = None
        if sort_order_raw is not None:
            try:
                sort_order = int(sort_order_raw)
            except (TypeError, ValueError):
                # Non-numeric sort_order is ignored rather than raised.
                sort_order = None

        return fm_tags, sort_order, fm_dict

    @staticmethod
    def _merge_tags(path_tags: list[str], frontmatter_tags: list[str]) -> list[str]:
        """Concatenate path and frontmatter tags, deduplicating case-insensitively.

        First occurrence wins and keeps its original casing; path tags come
        before frontmatter tags.
        """
        merged: list[str] = []
        seen: set[str] = set()
        for tag in [*path_tags, *frontmatter_tags]:
            normalized = tag.lower()
            if normalized in seen:
                continue
            seen.add(normalized)
            merged.append(tag)
        return merged

    @staticmethod
    def _calculate_hash(
        content: str,
        tags: list[str],
        domain: str,
        release: str,
        title: str,
        doc_name: str,
        sort_order: int | None,
        original_path: str = "",
        resource_path: str | None = None,
    ) -> str:
        """Fingerprint content plus all publish-relevant metadata.

        Tags are sorted case-insensitively so tag order does not affect the
        hash. md5 is used as a change-detection fingerprint, not for security.
        """
        sorted_tags = ",".join(sorted(tags, key=str.lower))
        hash_input = (
            f"{content}|tags:{sorted_tags}|domain:{domain}|release:{release}|title:{title}"
            f"|doc_name:{doc_name}"
            f"|sort_order:{sort_order if sort_order is not None else ''}"
            f"|original_path:{original_path}"
            f"|resource_path:{resource_path if resource_path is not None else ''}"
        )
        return hashlib.md5(hash_input.encode("utf-8")).hexdigest()

    def to_metadata_dict(self) -> JsonDict:
        """Serialize metadata fields for upload; optional fields only when set."""
        meta: JsonDict = {
            "tags": self.tags,
            "domain": self.domain,
            "release": self.release,
            "content_hash": self.content_hash,
            "ims_doc_id": self.ims_doc_id,
            "original_path": self.original_path,
            "doc_title": self.doc_title,
            "sort_order": self.sort_order,
        }
        if self.line_count is not None:
            meta["line_count"] = self.line_count
        if self.resource_path is not None:
            meta["resource_path"] = self.resource_path
        if self.frontmatter is not None:
            meta["frontmatter"] = cast(JsonValue, self.frontmatter)
        return meta
|