fraclab-sdk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
1
+ """Selection and configuration page."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import pandas as pd
7
+ import streamlit as st
8
+
9
+ from fraclab_sdk.algorithm import AlgorithmLibrary
10
+ from fraclab_sdk.config import SDKConfig
11
+ from fraclab_sdk.run import RunManager
12
+ from fraclab_sdk.selection.model import SelectionModel
13
+ from fraclab_sdk.snapshot import SnapshotLibrary
14
+ from fraclab_sdk.workbench import ui_styles
15
+
16
+ st.set_page_config(page_title="Selection", page_icon="✅", layout="wide", initial_sidebar_state="expanded")
17
+ st.title("Selection")
18
+
19
+ ui_styles.apply_global_styles()
20
+
21
+ # --- Page-Specific CSS ---
22
+ st.markdown("""
23
+ <style>
24
+ /* Hide Data Editor header buttons (sort arrows, menu) */
25
+ [data-testid="stDataEditor"] th button {
26
+ display: none !important;
27
+ }
28
+ /* Hide row number column if present */
29
+ [data-testid="stDataEditor"] td[aria-selected="false"] {
30
+ color: transparent !important;
31
+ }
32
+ </style>
33
+ """, unsafe_allow_html=True)
34
+
35
+
36
def get_libraries():
    """Build the SDK library handles used by this page.

    Returns:
        A ``(SnapshotLibrary, AlgorithmLibrary, RunManager)`` tuple whose
        members are all constructed from one freshly created ``SDKConfig``.
    """
    sdk_config = SDKConfig()
    snapshots_handle = SnapshotLibrary(sdk_config)
    algorithms_handle = AlgorithmLibrary(sdk_config)
    runs_handle = RunManager(sdk_config)
    return snapshots_handle, algorithms_handle, runs_handle
44
+
45
+
46
+ snapshot_lib, algorithm_lib, run_manager = get_libraries()
47
+
48
+ # Initialize session state
49
+ if "selection_model" not in st.session_state:
50
+ st.session_state.selection_model = None
51
+ if "selection_triggers" not in st.session_state:
52
+ st.session_state.selection_triggers = {}
53
+
54
+ snapshots = snapshot_lib.list_snapshots()
55
+ algorithms = algorithm_lib.list_algorithms()
56
+
57
+ if not snapshots:
58
+ st.info("No snapshots available. Import a snapshot first.")
59
+ st.stop()
60
+
61
+ if not algorithms:
62
+ st.info("No algorithms available. Import an algorithm first.")
63
+ st.stop()
64
+
65
+
66
+ # --- Helper Functions ---
67
+
68
+ def _detect_layout(dir_path: Path) -> str | None:
69
+ """Best-effort layout detection from on-disk files (Copied from Browse)."""
70
+ if not dir_path.exists():
71
+ return None
72
+ if (dir_path / "object.ndjson").exists():
73
+ return "object_ndjson_lines"
74
+ if (dir_path / "parquet").exists():
75
+ return "frame_parquet_item_dirs"
76
+ # Fallback: check if any parquet files exist in subdirs
77
+ if list(dir_path.rglob("*.parquet")):
78
+ return "frame_parquet_item_dirs"
79
+ return None
80
+
81
+
82
+ # ==========================================
83
+ # Dialogs
84
+ # ==========================================
85
@st.dialog("Data Requirement Specification (DRS)")
def show_drs_dialog(drs_data: dict):
    """Show the DRS payload as pretty-printed JSON inside a dialog.

    Args:
        drs_data: DRS content as a plain dict; it is serialised with
            ``json.dumps`` before being displayed.
    """
    st.caption("This defines the data structure required by the snapshot.")
    # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
    pretty_json = json.dumps(drs_data, ensure_ascii=False, indent=2)
    st.code(pretty_json, language="json")
89
+
90
+
91
+ # ==========================================
92
+ # 1 & 2. Context Selection (Snapshot & Algo)
93
+ # ==========================================
94
+ st.subheader("1. Configuration Context")
95
+
96
+ col_snap, col_algo = st.columns(2)
97
+
98
+ # --- Left: Snapshot ---
99
+ with col_snap:
100
+ with st.container(border=True):
101
+ st.markdown("#### 📦 Snapshot")
102
+ snapshot_options = {s.snapshot_id: s for s in snapshots}
103
+
104
+ selected_snapshot_id = st.selectbox(
105
+ "Select Snapshot",
106
+ options=list(snapshot_options.keys()),
107
+ format_func=lambda x: f"{x}",
108
+ label_visibility="collapsed"
109
+ )
110
+
111
+ if selected_snapshot_id:
112
+ snap_obj = snapshot_options[selected_snapshot_id]
113
+
114
+ sc1, sc2 = st.columns([3, 1])
115
+ with sc1:
116
+ st.caption(f"**Bundle ID:** `{snap_obj.bundle_id}`")
117
+ st.caption(f"**Imported:** {snap_obj.imported_at}")
118
+ with sc2:
119
+ # Updated API: width="stretch"
120
+ if st.button("📜 DRS", key=f"view_drs_{selected_snapshot_id}", help="View Data Requirements", width="stretch"):
121
+ try:
122
+ full_snap = snapshot_lib.get_snapshot(snap_obj.snapshot_id)
123
+ drs_data = full_snap.drs.model_dump(exclude_none=True)
124
+ show_drs_dialog(drs_data)
125
+ except Exception as e:
126
+ st.error(f"Cannot load DRS: {e}")
127
+
128
+ if snap_obj.description:
129
+ st.info(snap_obj.description)
130
+
131
+ # --- Right: Algorithm ---
132
+ with col_algo:
133
+ with st.container(border=True):
134
+ st.markdown("#### 🧩 Algorithm")
135
+ algo_options = {f"{a.algorithm_id}:{a.version}": a for a in algorithms}
136
+
137
+ selected_algo_key = st.selectbox(
138
+ "Select Algorithm",
139
+ options=list(algo_options.keys()),
140
+ format_func=lambda k: f"{algo_options[k].name or algo_options[k].algorithm_id} (v{algo_options[k].version})",
141
+ label_visibility="collapsed"
142
+ )
143
+
144
+ if selected_algo_key:
145
+ algo_obj = algo_options[selected_algo_key]
146
+ st.caption(f"**Contract:** `{algo_obj.contract_version}`")
147
+ authors = getattr(algo_obj, "authors", [])
148
+ if authors:
149
+ author_names = ", ".join([a.get("name", "Unknown") for a in authors])
150
+ st.caption(f"**Authors:** {author_names}")
151
+
152
+ if getattr(algo_obj, "summary", ""):
153
+ st.info(algo_obj.summary)
154
+
155
+
156
+ # Initialize Logic
157
+ if selected_snapshot_id and selected_algo_key:
158
+ snapshot = snapshot_lib.get_snapshot(selected_snapshot_id)
159
+ algo = algo_options[selected_algo_key]
160
+ algorithm = algorithm_lib.get_algorithm(algo.algorithm_id, algo.version)
161
+
162
+ current_snap_id = st.session_state.get("selection_snapshot_id")
163
+ current_algo_id = st.session_state.get("selection_algorithm_id")
164
+ current_algo_ver = st.session_state.get("selection_algorithm_version")
165
+
166
+ if (current_snap_id != selected_snapshot_id or
167
+ current_algo_id != algo.algorithm_id or
168
+ current_algo_ver != algo.version):
169
+
170
+ try:
171
+ selection_model = SelectionModel.from_snapshot_and_drs(snapshot, algorithm.drs)
172
+ st.session_state.selection_model = selection_model
173
+ st.session_state.selection_snapshot_id = selected_snapshot_id
174
+ st.session_state.selection_algorithm_id = algo.algorithm_id
175
+ st.session_state.selection_algorithm_version = algo.version
176
+ st.session_state.selection_triggers = {}
177
+ except Exception as e:
178
+ st.error(f"Failed to create selection model: {e}")
179
+ st.stop()
180
+ else:
181
+ selection_model = st.session_state.selection_model
182
+
183
+ st.divider()
184
+
185
+ # ==========================================
186
+ # 3. Data Selection
187
+ # ==========================================
188
+ st.subheader("2. Data Selection")
189
+
190
+ selectable = selection_model.get_selectable_datasets()
191
+
192
+ if not selectable:
193
+ st.warning("This algorithm does not require any specific dataset selection (DRS is empty).")
194
+
195
+ for ds in selectable:
196
+ dataset_key = ds.dataset_key
197
+
198
+ with st.container(border=True):
199
+ head_c1, head_c2 = st.columns([4, 1])
200
+ with head_c1:
201
+ st.markdown(f"##### 🗃️ {dataset_key}")
202
+ if ds.description:
203
+ st.caption(ds.description)
204
+ with head_c2:
205
+ st.caption(f"Req: **{ds.cardinality}**")
206
+ st.caption(f"Total: **{ds.total_items}**")
207
+
208
+ # --- Layout Detection Logic (multi-level fallback) ---
209
+ resolved_layout = None
210
+
211
+ # 1. Try dataspec (ds.json)
212
+ try:
213
+ resolved_layout = snapshot.get_layout(dataset_key)
214
+ except Exception:
215
+ pass
216
+
217
+ # 2. Try bundle manifest (manifest.json) - always has layout
218
+ if not resolved_layout:
219
+ try:
220
+ manifest_ds = snapshot.manifest.datasets.get(dataset_key)
221
+ if manifest_ds:
222
+ resolved_layout = manifest_ds.layout
223
+ except Exception:
224
+ pass
225
+
226
+ # 3. Fallback to filesystem auto-detection
227
+ if not resolved_layout:
228
+ data_root = snapshot.manifest.dataRoot or "data"
229
+ dataset_dir = snapshot.directory / data_root / dataset_key
230
+ resolved_layout = _detect_layout(dataset_dir)
231
+
232
+ items = snapshot.get_items(dataset_key)
233
+
234
+ # Pre-compute data paths for this dataset
235
+ data_root = snapshot.manifest.dataRoot or "data"
236
+ dataset_data_dir = snapshot.directory / data_root / dataset_key
237
+
238
+ # --- Helper to check status (prioritize warnings) ---
239
def _get_item_status(idx: int, layout_type: str | None) -> tuple[str, str]:
    """Describe the on-disk state of one dataset item as ``(label, detail)``.

    Warnings (Missing / Empty / Partial) take priority over the plain format
    label. Paths are resolved against ``dataset_data_dir`` from the enclosing
    scope.

    Args:
        idx: Item index; parquet items are looked up under
            ``parquet/item-<idx, zero-padded to 5 digits>``.
        layout_type: ``"frame_parquet_item_dirs"``, ``"object_ndjson_lines"``,
            or anything else (including ``None``) to infer the layout from
            the files that exist.

    Returns:
        A short status label and a longer detail string.
    """

    def _parquet_status(folder: Path) -> tuple[str, str]:
        """Inspect one parquet item directory and report the first problem found."""
        import pyarrow.parquet as pq

        if not folder.exists():
            return "⚠️ Missing", f"Directory: {folder.name}"

        found = list(folder.rglob("*.parquet"))
        n_files = len(found)
        if n_files == 0:
            return "⚠️ Empty", "No .parquet files"

        # Zero-byte files are reported before any metadata is read.
        n_zero_bytes = sum(1 for candidate in found if candidate.stat().st_size == 0)
        if n_zero_bytes:
            return "⚠️ Empty", f"{n_zero_bytes}/{n_files} files are 0 bytes"

        # Files can carry valid metadata and still hold no rows.
        row_total = 0
        n_rowless = 0
        for parquet_path in found:
            try:
                n_rows = pq.read_metadata(parquet_path).num_rows
            except Exception:
                # Metadata unreadable: leave this file out of the row checks.
                continue
            if n_rows == 0:
                n_rowless += 1
            row_total += n_rows

        if n_rowless == n_files:
            return "⚠️ Empty", "All files have 0 rows"
        if n_rowless:
            return "⚠️ Partial", f"{n_rowless}/{n_files} files have 0 rows"
        return "✓ Parquet", f"{n_files} file(s), {row_total:,} rows"

    parquet_root = dataset_data_dir / "parquet"

    if layout_type == "frame_parquet_item_dirs":
        return _parquet_status(parquet_root / f"item-{idx:05d}")

    # From here on the layout is either declared NDJSON or unknown; both start
    # by looking at object.ndjson and differ only in the detail text and in
    # what happens when that file is absent.
    declared_ndjson = layout_type == "object_ndjson_lines"
    ndjson_file = dataset_data_dir / "object.ndjson"

    if ndjson_file.exists():
        if ndjson_file.stat().st_size == 0:
            return "⚠️ Empty", "object.ndjson is 0 bytes"
        return "✓ NDJSON", ("OK" if declared_ndjson else "Auto-detected")

    if declared_ndjson:
        return "⚠️ Missing", "object.ndjson not found"

    # Layout not detected and no NDJSON file: try the parquet directory.
    if parquet_root.exists():
        return _parquet_status(parquet_root / f"item-{idx:05d}")

    return "❓ Unknown", "No data files found"
301
+
302
+ # --- CASE A: Single Selection ---
303
+ if ds.cardinality == "one":
304
+ options = list(range(len(items)))
305
+
306
def _fmt_single(idx):
    """Build the selectbox label for item ``idx``, including its file status."""
    label, _detail = _get_item_status(idx, resolved_layout)
    # Every status containing "Empty" is shown with one fixed warning text.
    shown = "⚠️ Empty" if "Empty" in label else label
    return f"Item {idx} ({shown})"
311
+
312
+ selected_idx = st.selectbox(
313
+ f"Select item for {dataset_key}",
314
+ options=options,
315
+ format_func=_fmt_single,
316
+ key=f"select_{dataset_key}"
317
+ )
318
+
319
+ if selected_idx is not None:
320
+ selection_model.set_selected(dataset_key, [selected_idx])
321
+
322
+ # --- CASE B: Multi Selection (Data Editor) ---
323
+ else:
324
+ current_selected_set = set(selection_model.get_selected(dataset_key))
325
+
326
+ rows = []
327
+ for idx, _ in items:
328
+ status_label, detail_help = _get_item_status(idx, resolved_layout)
329
+
330
+ rows.append({
331
+ "Selected": idx in current_selected_set,
332
+ "Index": idx,
333
+ "Type": status_label,
334
+ "_help": detail_help
335
+ })
336
+
337
+ df_items = pd.DataFrame(rows)
338
+
339
+ # Action Buttons
340
+ editor_key = f"editor_{dataset_key}"
341
+
342
+ col_btns, col_status = st.columns([2, 3])
343
+ with col_btns:
344
+ b_c1, b_c2, _ = st.columns([1, 1, 2], gap="small")
345
+ with b_c1:
346
+ # Updated API: width="stretch"
347
+ if st.button("All", key=f"all_{dataset_key}", width="stretch"):
348
+ all_indices = [r["Index"] for r in rows]
349
+ selection_model.set_selected(dataset_key, all_indices)
350
+ st.rerun()
351
+ with b_c2:
352
+ # Updated API: width="stretch"
353
+ if st.button("None", key=f"none_{dataset_key}", width="stretch"):
354
+ selection_model.set_selected(dataset_key, [])
355
+ st.rerun()
356
+
357
+ with col_status:
358
+ st.markdown(f"<div style='text-align:right; color:#666; padding-top:5px;'>Selected: <b>{len(current_selected_set)}</b> / {len(items)}</div>", unsafe_allow_html=True)
359
+
360
+ # Render Data Editor
361
+ # Updated API: use width="stretch" instead of use_container_width
362
+ edited_df = st.data_editor(
363
+ df_items,
364
+ key=editor_key,
365
+ height=300,
366
+ width="stretch",
367
+ hide_index=True,
368
+ num_rows="fixed",
369
+ column_config={
370
+ "Selected": st.column_config.CheckboxColumn(
371
+ "Select",
372
+ width="small",
373
+ default=False
374
+ ),
375
+ "Index": st.column_config.NumberColumn(
376
+ "Item ID",
377
+ format="%d",
378
+ width="small",
379
+ disabled=True
380
+ ),
381
+ "Type": st.column_config.TextColumn(
382
+ "File Type / Status",
383
+ width="medium",
384
+ disabled=True,
385
+ help="Shows file format or warns if file is empty"
386
+ ),
387
+ "_help": None # Hide internal column
388
+ }
389
+ )
390
+
391
+ new_selected_indices = edited_df[edited_df["Selected"]]["Index"].tolist()
392
+
393
+ if set(new_selected_indices) != current_selected_set:
394
+ selection_model.set_selected(dataset_key, new_selected_indices)
395
+ st.rerun()
396
+
397
+ st.divider()
398
+
399
+ # ==========================================
400
+ # 4. Validation & Parameters
401
+ # ==========================================
402
+
403
+ col_valid, col_params = st.columns([1, 1], gap="large")
404
+
405
+ with col_valid:
406
+ st.subheader("3. Validation")
407
+ errors = selection_model.validate()
408
+
409
+ with st.container(border=True):
410
+ if errors:
411
+ for err in errors:
412
+ st.error(f"**{err.dataset_key}**: {err.message}", icon="🚫")
413
+ else:
414
+ st.success("All selection requirements met.", icon="✅")
415
+
416
+ with col_params:
417
+ st.subheader("4. Parameters")
418
+ params_schema = algorithm.params_schema
419
+
420
+ defaults = {}
421
+ if "properties" in params_schema:
422
+ for key, prop in params_schema["properties"].items():
423
+ if "default" in prop:
424
+ defaults[key] = prop["default"]
425
+
426
+ with st.expander("Parameters Configuration", expanded=True):
427
+ params_json = st.text_area(
428
+ "JSON Input",
429
+ value=json.dumps(defaults, indent=2),
430
+ height=200,
431
+ help="Enter algorithm parameters as JSON",
432
+ label_visibility="collapsed"
433
+ )
434
+
435
+ try:
436
+ params = json.loads(params_json) if params_json.strip() else {}
437
+ except json.JSONDecodeError as e:
438
+ st.error(f"Invalid JSON: {e}")
439
+ params = None
440
+
441
+ st.divider()
442
+
443
+ # ==========================================
444
+ # 5. Execution
445
+ # ==========================================
446
+
447
+ col_spacer, col_action = st.columns([3, 1])
448
+
449
+ with col_action:
450
+ create_disabled = bool(errors) or (params is None)
451
+ # Updated API: width="stretch"
452
+ if st.button("🚀 Create & Start Run", type="primary", disabled=create_disabled, width="stretch"):
453
+ try:
454
+ run_id = run_manager.create_run(
455
+ snapshot_id=selected_snapshot_id,
456
+ algorithm_id=algo.algorithm_id,
457
+ algorithm_version=algo.version,
458
+ selection=selection_model,
459
+ params=params,
460
+ )
461
+ st.session_state.created_run_id = run_id
462
+ st.switch_page("pages/4_Run.py")
463
+ except Exception as e:
464
+ st.error(f"Failed to create run: {e}")