rosetta-cli 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rosetta_cli/__init__.py +12 -0
- rosetta_cli/__main__.py +6 -0
- rosetta_cli/cli.py +379 -0
- rosetta_cli/commands/__init__.py +5 -0
- rosetta_cli/commands/base_command.py +82 -0
- rosetta_cli/commands/cleanup_command.py +214 -0
- rosetta_cli/commands/list_command.py +70 -0
- rosetta_cli/commands/parse_command.py +205 -0
- rosetta_cli/commands/publish_command.py +113 -0
- rosetta_cli/commands/verify_command.py +46 -0
- rosetta_cli/ims_auth.py +124 -0
- rosetta_cli/ims_config.py +317 -0
- rosetta_cli/ims_publisher.py +859 -0
- rosetta_cli/ims_utils.py +28 -0
- rosetta_cli/ragflow_client.py +928 -0
- rosetta_cli/services/__init__.py +8 -0
- rosetta_cli/services/auth_service.py +114 -0
- rosetta_cli/services/dataset_service.py +72 -0
- rosetta_cli/services/document_data.py +408 -0
- rosetta_cli/services/document_service.py +357 -0
- rosetta_cli/typing_utils.py +49 -0
- rosetta_cli-2.0.0.dist-info/METADATA +639 -0
- rosetta_cli-2.0.0.dist-info/RECORD +26 -0
- rosetta_cli-2.0.0.dist-info/WHEEL +5 -0
- rosetta_cli-2.0.0.dist-info/entry_points.txt +2 -0
- rosetta_cli-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration management for IMS Publishing
|
|
3
|
+
|
|
4
|
+
Handles environment variables and configuration loading for RAGFlow-based
|
|
5
|
+
IMS publishing tools.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from dotenv import load_dotenv
|
|
13
|
+
|
|
14
|
+
from .typing_utils import JsonDict
|
|
15
|
+
|
|
16
|
+
ENV_FILE_ENV_VAR = "ROSETTA_CLI_ENV_FILE"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _candidate_env_names(env_name: str | None) -> list[str]:
|
|
20
|
+
names: list[str] = []
|
|
21
|
+
if env_name:
|
|
22
|
+
names.append(f".env.{env_name}")
|
|
23
|
+
names.append(".env")
|
|
24
|
+
return names
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def find_env_file(env_name: str | None = None) -> Path | None:
|
|
28
|
+
"""
|
|
29
|
+
.env file discovery.
|
|
30
|
+
|
|
31
|
+
Searches for `.env` files in the current working directory and its parents.
|
|
32
|
+
The optional `ROSETTA_CLI_ENV_FILE` environment variable takes precedence.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
env_name: Environment name (e.g., "remote", "dev"). If provided,
|
|
36
|
+
looks for .env.<env_name> first, then falls back to .env
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
Path to .env file if found, None otherwise
|
|
40
|
+
|
|
41
|
+
Examples:
|
|
42
|
+
>>> find_env_file("remote")
|
|
43
|
+
Path('/project/.env.remote')
|
|
44
|
+
|
|
45
|
+
>>> find_env_file()
|
|
46
|
+
Path('/project/.env')
|
|
47
|
+
"""
|
|
48
|
+
explicit_env_file = os.getenv(ENV_FILE_ENV_VAR)
|
|
49
|
+
if explicit_env_file:
|
|
50
|
+
env_path = Path(explicit_env_file).expanduser()
|
|
51
|
+
return env_path if env_path.exists() else None
|
|
52
|
+
|
|
53
|
+
current = Path.cwd().resolve()
|
|
54
|
+
for search_dir in (current, *current.parents):
|
|
55
|
+
for env_filename in _candidate_env_names(env_name):
|
|
56
|
+
env_path = search_dir / env_filename
|
|
57
|
+
if env_path.exists():
|
|
58
|
+
return env_path
|
|
59
|
+
|
|
60
|
+
return None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class IMSConfig:
    """
    RAGFlow configuration for IMS publishing.

    Environment Variables:
        RAGFLOW_BASE_URL: RAGFlow instance URL (default: "http://ragflow.local")
        RAGFLOW_API_KEY: API key for authentication (default: empty)
        RAGFLOW_DATASET_DEFAULT: Default dataset name (default: "aia")
        RAGFLOW_DATASET_TEMPLATE: Template for dataset names (default: "aia-{release}")
        RAGFLOW_EMBEDDING_MODEL: Embedding model (format: model_name@provider)
        RAGFLOW_CHUNK_METHOD: Chunking method (default: "naive")
        RAGFLOW_CHUNK_TOKEN_NUM: Chunk size in tokens (default: 512)
        RAGFLOW_DELIMITER: Delimiter for splitting chunks; backslash escapes
            such as "\\n" are decoded (default: a newline)
        RAGFLOW_AUTO_KEYWORDS: Auto-generate keywords per chunk (default: 0)
        RAGFLOW_AUTO_QUESTIONS: Auto-generate questions per chunk (default: 0)
        RAGFLOW_PAGE_SIZE: Page size for listing operations (default: 1000)
        RAGFLOW_PARSE_TIMEOUT: Timeout for parsing operations in seconds (default: 300)
        ENVIRONMENT: Environment name (default: "local")

    The API key is masked in both str() and repr() output.

    Examples:
        >>> config = IMSConfig.from_env(".env")
        >>> print(config.base_url)
        http://ragflow.local

        >>> config = IMSConfig.from_env_vars()
        >>> client = RAGFlowClient(config.api_key, config.base_url)
    """

    base_url: str
    api_key: str
    dataset_default: str = "aia"
    dataset_template: str = "aia-{release}"
    embedding_model: str | None = None
    chunk_method: str = "naive"
    # Only populated by from_env_vars() when chunk_method is "naive".
    parser_config: JsonDict | None = None
    environment: str = "local"
    page_size: int = 1000
    parse_timeout: int = 300

    @classmethod
    def from_env(
        cls,
        env_file: str | None = None,
        environment: str | None = None
    ) -> "IMSConfig":
        """
        Load configuration from .env file.

        Supports both explicit file paths and automatic discovery.

        Args:
            env_file: Explicit path to .env file ("~" is expanded). If not
                     provided, uses auto-discovery via find_env_file(). The
                     ROSETTA_CLI_ENV_FILE environment variable also works.
            environment: Environment name for auto-discovery (e.g., "remote").
                        Only used for discovery if env_file is not provided.
                        Looks for .env.<environment> or .env files.

        Returns:
            IMSConfig instance

        Raises:
            FileNotFoundError: If the env file cannot be found (or is a
                directory)
            ValueError: If a numeric setting holds a non-integer value

        Examples:
            # Explicit file path
            >>> config = IMSConfig.from_env("ragflow.env")

            # Auto-discovery with environment
            >>> config = IMSConfig.from_env(environment="remote")
            # Searches for: .env.remote, then .env

            # Auto-discovery (looks for .env)
            >>> config = IMSConfig.from_env()
        """
        env_path: Path
        if env_file:
            # Explicit path: expand "~" like find_env_file() does for the
            # override variable, and refuse directories.
            env_path = Path(env_file).expanduser()
            if not env_path.exists() or env_path.is_dir():
                raise FileNotFoundError(f"Environment file not found: {env_file}")
        else:
            discovered_env_path = find_env_file(environment)
            if not discovered_env_path:
                env_hint = f" (tried .env.{environment} and .env)" if environment else " (tried .env)"
                raise FileNotFoundError(
                    f"No .env file found{env_hint}\n"
                    f"Current directory: {Path.cwd()}\n"
                    f"Env override: {os.getenv(ENV_FILE_ENV_VAR, '(not set)')}\n"
                    "\nPlease create a .env file with RAGFLOW_BASE_URL and RAGFLOW_API_KEY"
                )
            env_path = discovered_env_path

        # Populate os.environ from the file, then read it back uniformly.
        load_dotenv(env_path)

        return cls.from_env_vars(environment=environment)

    @classmethod
    def from_env_vars(cls, environment: str | None = None) -> "IMSConfig":
        """
        Load configuration from environment variables.

        Missing credentials are not an error here: an unset RAGFLOW_API_KEY
        yields an empty api_key. Call validate() to enforce required values.

        Args:
            environment: Optional explicit environment name (e.g., "local",
                "dev", "remote"). If provided, this value is used and takes
                precedence over the ENVIRONMENT environment variable. If not
                provided, the ENVIRONMENT variable is used, defaulting to
                "local" when unset or empty.

        Returns:
            IMSConfig instance

        Raises:
            ValueError: If a numeric setting (page size, timeout, chunk size,
                auto keywords/questions) holds a non-integer value

        Examples:
            >>> os.environ["RAGFLOW_BASE_URL"] = "http://ragflow.local"
            >>> os.environ["RAGFLOW_API_KEY"] = "ragflow-xxx"
            >>> config = IMSConfig.from_env_vars()
        """
        resolved_environment = environment or os.getenv("ENVIRONMENT", "local") or "local"
        chunk_method = os.getenv("RAGFLOW_CHUNK_METHOD", "naive")

        # Parser settings only apply to the "naive" chunking method.
        parser_config: JsonDict | None = None
        if chunk_method == "naive":
            parser_config = {
                "chunk_token_num": cls._int_from_env("RAGFLOW_CHUNK_TOKEN_NUM", 512),
                "delimiter": cls._decode_delimiter(os.getenv("RAGFLOW_DELIMITER", "\\n")),
                "auto_keywords": cls._int_from_env("RAGFLOW_AUTO_KEYWORDS", 0),
                "auto_questions": cls._int_from_env("RAGFLOW_AUTO_QUESTIONS", 0),
            }

        return cls(
            base_url=os.getenv("RAGFLOW_BASE_URL", "http://ragflow.local"),
            api_key=os.getenv("RAGFLOW_API_KEY", ""),
            dataset_default=os.getenv("RAGFLOW_DATASET_DEFAULT", "aia"),
            dataset_template=os.getenv("RAGFLOW_DATASET_TEMPLATE", "aia-{release}"),
            # An empty value means "not configured".
            embedding_model=os.getenv("RAGFLOW_EMBEDDING_MODEL") or None,
            chunk_method=chunk_method,
            parser_config=parser_config,
            environment=resolved_environment,
            page_size=cls._int_from_env("RAGFLOW_PAGE_SIZE", 1000),
            parse_timeout=cls._int_from_env("RAGFLOW_PARSE_TIMEOUT", 300),
        )

    @staticmethod
    def _int_from_env(name: str, default: int) -> int:
        """Read integer variable *name*; unset or blank values yield *default*.

        Raises:
            ValueError: If the value is set but not an integer. The message
                names the offending variable.
        """
        raw = os.getenv(name)
        if raw is None or not raw.strip():
            return default
        try:
            return int(raw)
        except ValueError:
            raise ValueError(f"{name} must be an integer, got: {raw!r}") from None

    @staticmethod
    def _decode_delimiter(raw: str) -> str:
        """Decode backslash escapes (e.g. a literal "\\n") in a delimiter.

        Encoding through Latin-1 with ``backslashreplace`` keeps non-ASCII
        characters intact; a plain UTF-8 round trip through ``unicode_escape``
        would turn them into mojibake.
        """
        try:
            return raw.encode("latin-1", "backslashreplace").decode("unicode_escape")
        except UnicodeDecodeError:
            # Malformed escape (e.g. a trailing backslash): use the text as-is.
            return raw

    def _masked_api_key(self) -> str:
        """Return the API key truncated for display; short keys are fully hidden."""
        return f"{self.api_key[:10]}..." if len(self.api_key) > 10 else "***"

    def validate(self) -> bool:
        """
        Validate configuration.

        Prints a warning (without failing) when the API key does not carry
        the "ragflow-" prefix.

        Returns:
            True if configuration is valid

        Raises:
            ValueError: If base_url or api_key is empty, or base_url does not
                start with http:// or https://
        """
        if not self.base_url:
            raise ValueError("base_url cannot be empty")

        if not self.api_key:
            raise ValueError("api_key cannot be empty")

        if not self.base_url.startswith(("http://", "https://")):
            raise ValueError(
                f"base_url must start with http:// or https://, got: {self.base_url}"
            )

        if not self.api_key.startswith("ragflow-"):
            # Use the shared mask so a short key is never echoed in full.
            print(
                f"Warning: API key should start with 'ragflow-', got: {self._masked_api_key()}"
            )

        return True

    def save_credentials(self, env_file: str = ".env") -> None:
        """
        Save credentials to .env file.

        Existing lines that do not assign one of the managed variables are
        preserved. Previously written RAGFlow sections are replaced rather
        than appended to, so repeated saves do not grow the file.

        Args:
            env_file: Path to .env file to create/update

        Examples:
            >>> config = IMSConfig(...)
            >>> config.save_credentials("ragflow.env")
        """
        env_path = Path(env_file)
        section_header = "# RAGFlow Configuration"
        managed_keys = {
            "RAGFLOW_BASE_URL",
            "RAGFLOW_API_KEY",
            "RAGFLOW_DATASET_DEFAULT",
            "RAGFLOW_DATASET_TEMPLATE",
            "ENVIRONMENT",
        }

        preserved: list[str] = []
        if env_path.exists():
            with open(env_path, "r", encoding="utf-8") as f:
                for line in f:
                    text = line.rstrip()
                    if text.strip() == section_header:
                        continue  # header of a previously saved section
                    if self._assigned_key(text) in managed_keys:
                        continue  # stale value, rewritten below
                    preserved.append(text)

        # Drop trailing blank lines so exactly one separator precedes the section.
        while preserved and not preserved[-1]:
            preserved.pop()

        section = [
            section_header,
            f"RAGFLOW_BASE_URL={self.base_url}",
            f"RAGFLOW_API_KEY={self.api_key}",
            f"RAGFLOW_DATASET_DEFAULT={self.dataset_default}",
            f"RAGFLOW_DATASET_TEMPLATE={self.dataset_template}",
            f"ENVIRONMENT={self.environment}",
        ]
        new_lines = preserved + [""] + section if preserved else section

        with open(env_path, "w", encoding="utf-8") as f:
            f.write("\n".join(new_lines) + "\n")

        print(f"Saved configuration to {env_file}")

    @staticmethod
    def _assigned_key(line: str) -> str | None:
        """Return the variable name assigned on a dotenv line, or None.

        Handles an optional leading "export " and whitespace around "=".
        Commented-out assignments yield a name starting with "#", which never
        matches a managed key.
        """
        stripped = line.lstrip()
        if stripped.startswith("export "):
            stripped = stripped[len("export "):].lstrip()
        name, separator, _ = stripped.partition("=")
        return name.strip() if separator else None

    def __str__(self) -> str:
        """String representation (masks API key)"""
        return (
            f"IMSConfig(\n"
            f"  base_url={self.base_url}\n"
            f"  api_key={self._masked_api_key()}\n"
            f"  dataset_default={self.dataset_default}\n"
            f"  dataset_template={self.dataset_template}\n"
            f"  environment={self.environment}\n"
            f")"
        )

    def __repr__(self) -> str:
        """Debug representation; like __str__, never shows the full API key."""
        return (
            f"{type(self).__name__}("
            f"base_url={self.base_url!r}, "
            f"api_key={self._masked_api_key()!r}, "
            f"dataset_default={self.dataset_default!r}, "
            f"dataset_template={self.dataset_template!r}, "
            f"embedding_model={self.embedding_model!r}, "
            f"chunk_method={self.chunk_method!r}, "
            f"parser_config={self.parser_config!r}, "
            f"environment={self.environment!r}, "
            f"page_size={self.page_size!r}, "
            f"parse_timeout={self.parse_timeout!r})"
        )
|