rosetta-cli 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rosetta_cli/__init__.py +12 -0
- rosetta_cli/__main__.py +6 -0
- rosetta_cli/cli.py +379 -0
- rosetta_cli/commands/__init__.py +5 -0
- rosetta_cli/commands/base_command.py +82 -0
- rosetta_cli/commands/cleanup_command.py +214 -0
- rosetta_cli/commands/list_command.py +70 -0
- rosetta_cli/commands/parse_command.py +205 -0
- rosetta_cli/commands/publish_command.py +113 -0
- rosetta_cli/commands/verify_command.py +46 -0
- rosetta_cli/ims_auth.py +124 -0
- rosetta_cli/ims_config.py +317 -0
- rosetta_cli/ims_publisher.py +859 -0
- rosetta_cli/ims_utils.py +28 -0
- rosetta_cli/ragflow_client.py +928 -0
- rosetta_cli/services/__init__.py +8 -0
- rosetta_cli/services/auth_service.py +114 -0
- rosetta_cli/services/dataset_service.py +72 -0
- rosetta_cli/services/document_data.py +408 -0
- rosetta_cli/services/document_service.py +357 -0
- rosetta_cli/typing_utils.py +49 -0
- rosetta_cli-2.0.0.dist-info/METADATA +639 -0
- rosetta_cli-2.0.0.dist-info/RECORD +26 -0
- rosetta_cli-2.0.0.dist-info/WHEEL +5 -0
- rosetta_cli-2.0.0.dist-info/entry_points.txt +2 -0
- rosetta_cli-2.0.0.dist-info/top_level.txt +1 -0
rosetta_cli/__init__.py
ADDED
rosetta_cli/__main__.py
ADDED
rosetta_cli/cli.py
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
"""Rosetta CLI entry point."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from typing import TypeAlias
|
|
8
|
+
|
|
9
|
+
from .commands.base_command import BaseCommand
|
|
10
|
+
from .commands.cleanup_command import CleanupCommand
|
|
11
|
+
from .commands.list_command import ListCommand
|
|
12
|
+
from .commands.parse_command import ParseCommand
|
|
13
|
+
from .commands.publish_command import PublishCommand
|
|
14
|
+
from .commands.verify_command import VerifyCommand
|
|
15
|
+
from .ims_config import IMSConfig
|
|
16
|
+
from .ragflow_client import RAGFlowClient
|
|
17
|
+
from .typing_utils import CommandArgs
|
|
18
|
+
|
|
19
|
+
# Factory signature shared by all commands: a callable taking the RAGFlow
# client and the loaded configuration and returning a command instance.
# (Every concrete command class satisfies this via its constructor.)
CommandClass: TypeAlias = Callable[[RAGFlowClient, IMSConfig], BaseCommand]


# Command registry mapping command names to their classes
# Keys are the CLI subcommand names declared in main(); execute_command()
# looks the chosen name up here to instantiate and run the command.
COMMAND_REGISTRY: dict[str, CommandClass] = {
    'publish': PublishCommand,
    'verify': VerifyCommand,
    'list-dataset': ListCommand,
    'cleanup-dataset': CleanupCommand,
    'parse': ParseCommand,
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def execute_command(command_name: str, args: CommandArgs, client: RAGFlowClient, config: IMSConfig) -> int:
    """
    Look up *command_name* in the command registry and run it.

    Args:
        command_name: Name of the command to execute
        args: Parsed command-line arguments
        client: RAGFlow client instance
        config: RAGFlow configuration

    Returns:
        Exit code (0 for success, 1 for failure)
    """
    factory = COMMAND_REGISTRY.get(command_name)
    if factory is None:
        # Should not happen when argparse validated the subcommand, but
        # fail loudly rather than crash on an unknown name.
        print(f"Unknown command: {command_name}")
        return 1
    return factory(client, config).execute(args)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _add_env_args(parser: argparse.ArgumentParser) -> None:
    """Attach the shared --env / --env-file options to *parser*.

    Every subcommand (and the top-level parser) accepts these two options;
    this helper keeps the declarations in one place instead of six copies.
    """
    parser.add_argument(
        '--env',
        type=str,
        default=None,
        help='Environment (local, dev, test, production)'
    )
    parser.add_argument(
        '--env-file',
        type=str,
        default=None,
        help='Explicit path to a .env file'
    )


def main() -> int:
    """Main CLI entry point.

    Builds the argument parser (one subparser per registered command), loads
    and validates configuration, constructs the RAGFlow client, and dispatches
    to the selected command.

    Returns:
        Exit code (0 for success, 1 for failure)
    """
    parser = argparse.ArgumentParser(
        description="Rosetta CLI - Publish knowledge base content to RAGFlow and manage datasets\n"
                    "All commands include performance timing measurements.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Publish knowledge base content from a folder (with timing)
  rosetta-cli publish ../instructions

  # Publish with dry-run (no actual upload)
  rosetta-cli publish ../business --dry-run

  # Force republish (ignore change detection)
  rosetta-cli publish ../instructions --force

  # List documents in dataset
  rosetta-cli list-dataset

  # List documents in specific dataset
  rosetta-cli list-dataset --dataset aia-r1

  # Cleanup (delete all documents from) a dataset
  rosetta-cli cleanup-dataset --dataset aia-r1

  # Preview cleanup with dry-run (shows what would be deleted)
  rosetta-cli cleanup-dataset --dataset aia-r1 --dry-run

  # Cleanup documents with specific prefix
  rosetta-cli cleanup-dataset --dataset aia-r1 --prefix "aqa-phase"

  # Cleanup documents with specific tags (space-separated)
  rosetta-cli cleanup-dataset --dataset aia-r1 --tags "r1 agents"

  # Cleanup documents with specific tags (comma-separated)
  rosetta-cli cleanup-dataset --dataset aia-r1 --tags "r1,agents"

  # Preview cleanup with prefix filter
  rosetta-cli cleanup-dataset --dataset aia-r1 --prefix "aqa-phase" --dry-run

  # Preview cleanup with tags filter
  rosetta-cli cleanup-dataset --dataset aia-r1 --tags "r1 agents" --dry-run

  # Cleanup with force (skip confirmation)
  rosetta-cli cleanup-dataset --dataset aia-r1 --force

  # Cleanup with prefix and force
  rosetta-cli cleanup-dataset --dataset aia-r1 --prefix "aqa-phase" --force

  # Cleanup with tags and force
  rosetta-cli cleanup-dataset --dataset aia-r1 --tags "r1,agents" --force

  # Parse documents in dataset (retry failed/unparsed)
  rosetta-cli parse --dataset aia-r1

  # Force re-parse all documents (e.g., after changing parser config)
  rosetta-cli parse --dataset aia-r1 --force

  # Preview which documents would be parsed
  rosetta-cli parse --dataset aia-r1 --dry-run

  # Verify connection
  rosetta-cli verify

  # Use different environment
  rosetta-cli publish ../instructions --env production

Performance Notes:
  - All commands show execution time (ā±ļø Total time: X.XXs)
  - Publishing ~10-15s per file (embedding generation)
  - Change detection skips unchanged files (77% faster)
  - API key verification timing shown when applicable

Tag-in-Title Format:
  - Documents are published with tags in title: [tag1][tag2] filename.ext
  - Example: [instructions][agents][r1] agents.md

Frontmatter Metadata (publish flow):
  - Supported keys: tags, sort_order
  - tags can be list or comma-separated string
  - tags are merged with path-based tags (case-insensitive dedupe)
  - sort_order is persisted to metadata and affects MCP bundling order
  - original_path/resource_path are normalized from instructions-relative path when applicable
"""
    )

    # Global arguments (also repeated on each subparser so they may be
    # given either before or after the subcommand name)
    _add_env_args(parser)

    # Subcommands
    subparsers = parser.add_subparsers(dest='command', help='Command to execute')

    # Publish command
    publish_parser = subparsers.add_parser(
        'publish',
        help='Publish knowledge base content to RAGFlow'
    )
    publish_parser.add_argument(
        'path',
        type=str,
        help='Path to content file or folder (e.g., instructions/, business/)'
    )
    publish_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Simulate publishing without actual upload'
    )
    publish_parser.add_argument(
        '--force',
        action='store_true',
        help='Force republish all files, ignoring change detection'
    )
    publish_parser.add_argument(
        '--no-parse',
        action='store_true',
        help='Skip parsing documents after upload (for debugging)'
    )
    publish_parser.add_argument(
        '--parse-timeout',
        type=int,
        default=300,
        help='Timeout for parsing in seconds (default: 300)'
    )
    _add_env_args(publish_parser)

    # Verify command
    verify_parser = subparsers.add_parser(
        'verify',
        help='Verify RAGFlow connection and API key'
    )
    _add_env_args(verify_parser)

    # List dataset command
    list_parser = subparsers.add_parser(
        'list-dataset',
        help='List documents in a dataset'
    )
    list_parser.add_argument(
        '--dataset',
        type=str,
        default=None,
        help='Dataset name (defaults to configured dataset)'
    )
    _add_env_args(list_parser)

    # Cleanup dataset command
    cleanup_parser = subparsers.add_parser(
        'cleanup-dataset',
        help='Delete all documents from a dataset'
    )
    cleanup_parser.add_argument(
        '--dataset',
        type=str,
        default=None,
        help='Dataset name (defaults to configured dataset)'
    )
    cleanup_parser.add_argument(
        '--prefix',
        type=str,
        default=None,
        help='Only delete documents with titles starting with this prefix (e.g., "aqa-phase")'
    )
    cleanup_parser.add_argument(
        '--tags',
        type=str,
        default=None,
        help='Only delete documents with these tags (space or comma separated, e.g., "r1 agents" or "r1,agents")'
    )
    cleanup_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be deleted without actually deleting'
    )
    cleanup_parser.add_argument(
        '--force',
        action='store_true',
        help='Skip confirmation prompt'
    )
    _add_env_args(cleanup_parser)

    # Parse command
    parse_parser = subparsers.add_parser(
        'parse',
        help='Trigger parsing for documents in a dataset'
    )
    parse_parser.add_argument(
        '--dataset',
        type=str,
        default=None,
        help='Dataset name (defaults to configured dataset)'
    )
    parse_parser.add_argument(
        '--force',
        action='store_true',
        help='Force re-parse ALL documents, even if already parsed'
    )
    parse_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show which documents would be parsed without actually parsing'
    )
    parse_parser.add_argument(
        '--yes',
        action='store_true',
        help='Skip confirmation prompt'
    )
    parse_parser.add_argument(
        '--parse-timeout',
        type=int,
        default=300,
        help='Timeout for parsing in seconds (default: 300)'
    )
    _add_env_args(parse_parser)

    # Parse arguments
    args = parser.parse_args()

    # Check if command was provided
    if not args.command:
        parser.print_help()
        return 1

    try:
        # Load configuration
        config = IMSConfig.from_env(env_file=args.env_file, environment=args.env)

        # Validate configuration
        config.validate()

        # Initialize RAGFlow client
        client = RAGFlowClient(
            api_key=config.api_key,
            base_url=config.base_url,
            embedding_model=config.embedding_model,
            chunk_method=config.chunk_method,
            parser_config=config.parser_config,
            page_size=config.page_size
        )

        return execute_command(args.command, args, client, config)

    except ValueError as e:
        # Raised by IMSConfig.from_env / validate on missing or bad settings
        print(f"ā Configuration error: {e}")
        print("\nPlease ensure you have:")
        print("1. Created a .env file (copy from env.template)")
        print("2. Set RAGFLOW_BASE_URL and RAGFLOW_API_KEY")
        return 1
    except KeyboardInterrupt:
        print("\n\nOperation cancelled.")
        return 1
    except Exception as e:
        # Last-resort handler: report and show the traceback for debugging
        print(f"ā Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return 1
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
# Script entry point (python -m rosetta_cli.cli or direct execution):
# propagate main()'s exit code to the shell.
if __name__ == '__main__':
    sys.exit(main())
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base Command Abstract Class
|
|
3
|
+
|
|
4
|
+
Defines the interface and common functionality for all IMS CLI commands.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import time
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
|
|
10
|
+
from ..ims_config import IMSConfig
|
|
11
|
+
from ..ragflow_client import RAGFlowClient
|
|
12
|
+
from ..typing_utils import CommandArgs
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BaseCommand(ABC):
    """
    Abstract base class for all IMS CLI commands.

    Provides common functionality for authentication, timing, and error handling.
    Each command must implement the execute() method.
    """

    def __init__(self, client: RAGFlowClient, config: IMSConfig):
        """
        Initialize command with RAGFlow client and configuration.

        Args:
            client: RAGFlow client instance
            config: IMS configuration
        """
        self.client = client
        self.config = config
        # Wall-clock timestamp set by _start_timing(); None until started.
        self._start_time: float | None = None

    @abstractmethod
    def execute(self, args: CommandArgs) -> int:
        """
        Run the command with the parsed arguments.

        Args:
            args: Parsed command-line arguments (argparse.Namespace)

        Returns:
            Exit code (0 for success, non-zero for failure)
        """
        raise NotImplementedError

    def _start_timing(self) -> None:
        """Record the current time as the start of command execution."""
        self._start_time = time.time()

    def _get_elapsed_time(self) -> float:
        """
        Return seconds elapsed since _start_timing() was called.

        Returns:
            Elapsed time in seconds; 0.0 if timing was never started.
        """
        started = self._start_time
        return 0.0 if started is None else time.time() - started

    def _print_timing(self, label: str = "Total time") -> None:
        """
        Print the elapsed time for this command.

        Args:
            label: Label for the timing output
        """
        print(f"ā±ļø {label}: {self._get_elapsed_time():.2f}s")

    def _print_header(self, title: str) -> None:
        """
        Print the command title followed by environment and instance info.

        Args:
            title: Command title/description
        """
        for line in (
            title,
            f"Environment: {self.config.environment}",
            f"RAGFlow Instance: {self.config.base_url}",
        ):
            print(line)
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cleanup Command - Delete documents from a dataset
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from ..services.dataset_service import DatasetService
|
|
6
|
+
from ..services.document_service import DocumentService
|
|
7
|
+
|
|
8
|
+
from .base_command import BaseCommand
|
|
9
|
+
from ..typing_utils import CommandArgs, DatasetLike, DocumentLike
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CleanupCommand(BaseCommand):
    """
    Delete documents from a RAGFlow dataset.

    CRITICAL: Implements safety measures (dry-run, confirmation) to prevent accidental data loss.
    """

    def execute(self, args: CommandArgs) -> int:
        """
        Execute the cleanup-dataset command.

        Args:
            args: Parsed command-line arguments (uses dataset, tags, prefix,
                dry_run, force)

        Returns:
            Exit code (0 for success or nothing to delete, 1 for failure or
            cancelled confirmation)
        """
        self._start_timing()

        # Resolve dataset name (explicit --dataset or the configured default)
        dataset_service = DatasetService(self.client, self.config)
        dataset_name, _auto_detected = dataset_service.resolve_dataset_name(args.dataset)

        if not dataset_name:
            return 1

        # Print header
        print(f"Cleaning up Dataset: {dataset_name}")
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}\n")

        # Verify authentication before any destructive work.
        # FIX: must be a package-relative import; the original bare
        # `from services.auth_service import ...` fails for an installed
        # wheel because no top-level `services` package exists on sys.path.
        from ..services.auth_service import AuthService
        AuthService.verify_or_exit(self.client, self.config)

        try:
            # Get dataset
            dataset = self.client.get_dataset(name=dataset_name)

            if not dataset:
                print(f"ā Dataset '{dataset_name}' not found")
                dataset_service.display_available_datasets()
                return 1

            # Select documents matching --tags / --prefix (or all)
            filtered_documents = self._get_filtered_documents(dataset, args)

            if not filtered_documents:
                print("ā No documents found")
                return 0

            # Show documents
            self._display_documents(filtered_documents)

            # Dry-run mode: report and stop before any deletion
            if args.dry_run:
                return self._handle_dry_run(filtered_documents)

            # Confirm deletion (skipped with --force)
            if not self._confirm_deletion(filtered_documents, dataset_name, args):
                print("\nā Cleanup cancelled")
                return 1

            # Delete documents
            deleted, failed = self._delete_documents(dataset, filtered_documents)

            # Print summary
            self._print_summary(deleted, failed)

            return 0 if failed == 0 else 1

        except Exception as e:
            print(f"\nā Error cleaning up dataset: {e}")
            self._print_timing()
            import traceback
            traceback.print_exc()
            return 1

    def _get_filtered_documents(
        self,
        dataset: DatasetLike,
        args: CommandArgs,
    ) -> list[DocumentLike]:
        """
        Return the documents to delete, applying the --tags or --prefix filter.

        --tags takes precedence over --prefix; with neither, every document in
        the dataset (up to the configured page size) is returned.
        """
        document_service = DocumentService(self.client)

        if args.tags:
            # Filter by tags (metadata condition)
            tags_list = self._parse_tags(args.tags)
            filtered_documents = document_service.filter_documents_by_tags(
                dataset, tags_list
            )
            print(f"\nFiltered {len(filtered_documents)} document(s) with tags: {', '.join(tags_list)}\n")
        elif args.prefix:
            # Filter by prefix
            filtered_documents = document_service.filter_documents_by_prefix(
                dataset, args.prefix
            )
            print(f"\nFiltered {len(filtered_documents)} document(s) matching prefix '{args.prefix}'\n")
        else:
            # No filter - fetch all
            filtered_documents = dataset.list_documents(page_size=self.config.page_size)
            if filtered_documents:
                print(f"\nFound {len(filtered_documents)} document(s) to delete\n")

        return filtered_documents

    def _parse_tags(self, tags_arg: str) -> list[str]:
        """
        Parse tags from a comma- or space-separated string.

        Empty entries (e.g. from "a,,b" or stray whitespace) are dropped.
        """
        # Support both comma and space separated tags
        if ',' in tags_arg:
            tags = [tag.strip() for tag in tags_arg.split(',')]
        else:
            tags = tags_arg.split()

        # Filter out empty strings
        return [tag for tag in tags if tag]

    def _display_documents(self, documents: list[DocumentLike]) -> None:
        """Display the titles of the documents that will be deleted."""
        print("Documents to delete:")
        print("="*80)
        for document in documents:
            # Document objects are duck-typed; tolerate a missing name.
            doc_name = getattr(document, 'name', 'Untitled')
            print(f" ⢠{doc_name}")
        print("="*80)
        print()

    def _handle_dry_run(self, documents: list[DocumentLike]) -> int:
        """Report what would be deleted without deleting anything."""
        print("š DRY-RUN MODE - No documents will be deleted\n")
        print(f"Summary: {len(documents)} document(s) would be deleted")
        print("="*80)
        self._print_timing()
        return 0

    def _confirm_deletion(
        self,
        documents: list[DocumentLike],
        dataset_name: str,
        args: CommandArgs,
    ) -> bool:
        """Ask the user to confirm deletion; --force skips the prompt."""
        if args.force:
            return True

        # Describe the active filter (if any) in the prompt
        filter_msg = ""
        if args.tags:
            tags_list = self._parse_tags(args.tags)
            filter_msg = f" with tags ({', '.join(tags_list)})"
        elif args.prefix:
            filter_msg = f" with prefix '{args.prefix}'"

        response = input(
            f"ā ļø Delete {len(documents)} documents{filter_msg} from '{dataset_name}'? (yes/no): "
        )
        return response.lower() in ['yes', 'y']

    def _delete_documents(
        self,
        dataset: DatasetLike,
        documents: list[DocumentLike],
    ) -> tuple[int, int]:
        """
        Delete *documents* from *dataset* in one batch API call.

        Returns:
            Tuple of (deleted_count, failed_count)
        """
        deleted_count = 0
        failed_count = 0

        # Collect document IDs, remembering names for per-document reporting
        doc_ids_to_delete = []
        doc_id_to_name = {}

        for document in documents:
            doc_id = getattr(document, 'id', None)
            doc_name = getattr(document, 'name', 'Untitled')

            if not doc_id:
                print(f"ā Cannot delete {doc_name}: No document ID")
                failed_count += 1
                continue

            doc_ids_to_delete.append(doc_id)
            doc_id_to_name[doc_id] = doc_name

        # Delete batch
        if doc_ids_to_delete:
            try:
                dataset.delete_documents(ids=doc_ids_to_delete)
                deleted_count = len(doc_ids_to_delete)

                for doc_id in doc_ids_to_delete:
                    doc_name = doc_id_to_name.get(doc_id, 'Unknown')
                    print(f"ā Deleted: {doc_name}")

            except Exception as e:
                print(f"ā Failed to delete documents: {e}")
                # FIX: add to, not overwrite, failures already counted for
                # documents that had no ID.
                failed_count += len(doc_ids_to_delete)

        return deleted_count, failed_count

    def _print_summary(self, deleted: int, failed: int) -> None:
        """Print cleanup summary counts."""
        print("\n" + "="*80)
        print("Cleanup Summary:")
        print(f" ā Deleted: {deleted}")
        print(f" ā Failed: {failed}")
        print("="*80)
        self._print_timing()
|