fraclab-sdk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,513 @@
1
+ """Browse snapshot data page."""
2
+
3
+ import json
4
+ from typing import Any, Iterable
5
+
6
+ import pandas as pd
7
+ import streamlit as st
8
+
9
+ from fraclab_sdk.config import SDKConfig
10
+ from fraclab_sdk.snapshot import SnapshotLibrary
11
+ from fraclab_sdk.workbench import ui_styles
12
+
13
# Streamlit requires set_page_config to be the first st.* call on the page.
st.set_page_config(page_title="Browse", page_icon="🔍", layout="wide", initial_sidebar_state="expanded")
st.title("Browse")

ui_styles.apply_global_styles()

# --- Page-Specific CSS ---
# Injected raw HTML so these rules can override the global styles applied above.
st.markdown("""
<style>
/* Hide download button */
[data-testid="stDownloadButton"] {
    display: none !important;
}

/* Pagination button styling (override global) */
div[data-testid="stButton"] button {
    padding: 0.25rem 0.75rem !important;
    min-width: 40px !important;
}

/* Pagination ellipsis styling */
.pagination-ellipsis {
    text-align: center;
    line-height: 2.3rem;
    color: #888;
    font-weight: bold;
}

/* Custom static table styling (replacement for st.dataframe) */
.table-wrapper {
    max-height: 500px;
    overflow: auto;
    border: 1px solid #e6e9ef;
    border-radius: 0.25rem;
    margin-bottom: 1rem;
    background-color: white;
}

.custom-table {
    width: 100%;
    border-collapse: collapse;
    font-family: "Source Sans Pro", sans-serif;
    font-size: 14px;
    color: #31333F;
    user-select: none !important;
}

/* Sticky table header */
.custom-table th {
    position: sticky;
    top: 0;
    background-color: #f0f2f6;
    color: #31333F;
    z-index: 2;
    padding: 8px 12px;
    text-align: left;
    font-weight: 600;
    border-bottom: 2px solid #e6e9ef;
    white-space: nowrap;
}

/* Table cell styling */
.custom-table td {
    padding: 8px 12px;
    border-bottom: 1px solid #f0f2f6;
    white-space: nowrap;
    vertical-align: middle;
}

/* Zebra striping */
.custom-table tr:nth-child(even) {
    background-color: #f9f9fb;
}

/* Hover highlight */
.custom-table tr:hover {
    background-color: #f1f3f8;
}
</style>
""", unsafe_allow_html=True)
92
+
93
+
94
+ # --- Utils & Components ---
95
+
96
def _render_static_table(df: pd.DataFrame):
    """
    Render *df* as a static HTML table, replacing st.dataframe.

    Features: no cell menus, full content display (no truncation),
    sticky headers, and custom styling from the page CSS.
    """
    # Blank out NaN cells so the rendered HTML never shows the literal 'nan'.
    filled = df.fillna("")

    # escape=True HTML-escapes cell values (XSS guard); the pandas index
    # column is omitted — callers carry their own row index if needed.
    markup = filled.to_html(index=False, classes="custom-table", border=0, escape=True)

    # Wrap in the scrollable .table-wrapper container.
    st.markdown(f'<div class="table-wrapper">{markup}</div>', unsafe_allow_html=True)
110
+
111
+
112
+ def _read_ndjson_slice(path, start: int, limit: int) -> list[tuple[int, dict]]:
113
+ """Read a slice of ndjson lines [start, start+limit)."""
114
+ results: list[tuple[int, dict]] = []
115
+ with path.open() as f:
116
+ for i, line in enumerate(f):
117
+ if i < start:
118
+ continue
119
+ if len(results) >= limit:
120
+ break
121
+ try:
122
+ results.append((i, json.loads(line)))
123
+ except Exception:
124
+ results.append((i, {"_error": "Failed to parse line", "raw": line.strip()}))
125
+ return results
126
+
127
+
128
def _render_pagination(current: int, total: int, key_prefix: str) -> int:
    """
    Render a compact, centered pagination bar.
    Updates session state and reruns if clicked.
    """
    # Seed the persisted page number on first render of this bar.
    if f"{key_prefix}_current" not in st.session_state:
        st.session_state[f"{key_prefix}_current"] = current

    display_current = int(st.session_state.get(f"{key_prefix}_current", current))
    clicked = False

    def _page_buttons(cur: int) -> Iterable[int | str]:
        # Few pages: show every page number.
        if total <= 9:
            return list(range(1, total + 1))
        # Otherwise show the first two pages, a window around the current
        # page, and the last two pages, with "…" filling any gaps.
        window = [cur - 1, cur, cur + 1]
        window = [p for p in window if 1 <= p <= total]
        pages = [1, 2] + window + [total - 1, total]
        pages = sorted(set(pages))
        display = []
        last = None
        for p in pages:
            if last and p - last > 1:
                display.append("…")
            display.append(p)
            last = p
        return display

    buttons = list(_page_buttons(display_current))

    st.markdown("---")

    # One slot per control: prev + page buttons/ellipses + next, flanked by
    # wide spacer columns to keep the bar visually centered.
    num_slots = len(buttons) + 2
    spacer_ratio = 6 if num_slots < 6 else 1.5
    col_ratios = [spacer_ratio] + [1] * num_slots + [spacer_ratio]

    cols = st.columns(col_ratios, gap="small")
    action_cols = cols[1:-1]

    chosen = display_current

    # "prev" occupies the first action column.
    if action_cols[0].button("‹", key=f"{key_prefix}_prev", disabled=display_current <= 1):
        chosen = max(1, display_current - 1)
        clicked = True

    # Page buttons occupy action_cols[1:-1]; enumerate starts at 1 so the
    # column index lines up past the prev button.
    for idx, p in enumerate(buttons, start=1):
        if p == "…":
            action_cols[idx].markdown("<div class='pagination-ellipsis'>…</div>", unsafe_allow_html=True)
            continue

        if action_cols[idx].button(
            f"{p}",
            key=f"{key_prefix}_page_{p}",
            type="primary" if p == display_current else "secondary",
        ):
            chosen = p
            clicked = True

    # "next" occupies the last action column.
    if action_cols[-1].button("›", key=f"{key_prefix}_next", disabled=display_current >= total):
        chosen = min(total, display_current + 1)
        clicked = True

    # Persist the (possibly unchanged) selection before rerunning.
    st.session_state[f"{key_prefix}_current"] = chosen

    if clicked:
        # st.rerun() superseded st.experimental_rerun() in newer Streamlit;
        # fall back for older versions.
        try:
            st.rerun()
        except AttributeError:
            st.experimental_rerun()
    return chosen
197
+
198
+
199
+ def _detect_layout(dir_path) -> str | None:
200
+ if (dir_path / "object.ndjson").exists():
201
+ return "object_ndjson_lines"
202
+ if (dir_path / "parquet").exists():
203
+ return "frame_parquet_item_dirs"
204
+ return None
205
+
206
+
207
def get_library():
    """Return a SnapshotLibrary backed by the default SDK configuration."""
    return SnapshotLibrary(SDKConfig())
210
+
211
+
212
# --- Main Logic ---

# Load the snapshot library; stop the page early when nothing is available.
snapshot_lib = get_library()
snapshots = snapshot_lib.list_snapshots()

if not snapshots:
    st.info("No snapshots available. Import a snapshot first.")
    st.stop()

# 1. Select Snapshot
# Map snapshot_id -> snapshot record so the selectbox label can show the bundle id.
snapshot_options = {s.snapshot_id: s for s in snapshots}
selected_id = st.selectbox(
    "Select Snapshot",
    options=list(snapshot_options.keys()),
    format_func=lambda x: f"{x} ({snapshot_options[x].bundle_id})",
)

if not selected_id:
    st.stop()

snapshot = snapshot_lib.get_snapshot(selected_id)

st.divider()
235
+
236
# 2. Select Dataset
st.subheader("Datasets")
datasets = snapshot.get_datasets()

if not datasets:
    st.info("No datasets in this snapshot")
    st.stop()

# Map dataset_key -> dataset metadata dict; the label shows the item count.
dataset_options = {d["dataset_key"]: d for d in datasets}
selected_dataset_key = st.selectbox(
    "Select Dataset",
    options=list(dataset_options.keys()),
    format_func=lambda k: f"{k} ({dataset_options[k]['item_count']} items)",
)

if not selected_dataset_key:
    st.stop()

dataset_info = dataset_options[selected_dataset_key]

# Summary card: item count, storage layout, and resource type.
with st.container(border=True):
    c1, c2, c3 = st.columns(3)
    c1.metric("Total Items", dataset_info["item_count"])
    c2.caption("Layout")
    c2.markdown(f"**{dataset_info['layout'] or 'N/A'}**")
    c3.caption("Resource Type")
    c3.markdown(f"**{dataset_info['resource_type'] or 'N/A'}**")

st.divider()
265
+
266
# 3. Items Explorer
st.subheader("Items Explorer")

# get_items yields (index, item) pairs — see the tuple unpacking below.
items = snapshot.get_items(selected_dataset_key)
layout = dataset_info["layout"]

if not items:
    st.info("No items in this dataset")
else:
    # --- Pagination for Items ---
    items_per_page = 20
    total_items = len(items)
    # Ceiling division for the page count.
    total_pages = (total_items + items_per_page - 1) // items_per_page

    page_key = f"items_page_{selected_dataset_key}"
    # Prefer the value _render_pagination persisted under "<key>_current".
    page = st.session_state.get(f"{page_key}_current", st.session_state.get(page_key, 1))

    start_idx = (page - 1) * items_per_page
    end_idx = min(start_idx + items_per_page, total_items)

    current_items_slice = items[start_idx:end_idx]

    # --- Prepare Data ---
    # Items appear to be pydantic-style models (model_dump); the
    # AttributeError fallback stringifies anything else.
    item_dicts = []
    if current_items_slice:
        for real_idx, item_obj in current_items_slice:
            try:
                d = item_obj.model_dump(exclude_none=True)
                d["_index"] = real_idx
                item_dicts.append(d)
            except AttributeError:
                item_dicts.append({"_index": real_idx, "raw": str(item_obj)})

    # --- View Tabs ---
    tab_table, tab_cards = st.tabs(["📊 Table View", "📝 Detail Cards"])

    with tab_table:
        st.markdown(f"<small>Showing items {start_idx + 1}-{end_idx} of {total_items}</small>", unsafe_allow_html=True)

        if item_dicts:
            df = pd.DataFrame(item_dicts)

            # Reorder columns so _index comes first.
            cols = df.columns.tolist()
            if "_index" in cols:
                cols.insert(0, cols.pop(cols.index("_index")))
            df = df[cols]

            # Use the custom static table instead of st.dataframe.
            _render_static_table(df)

        else:
            st.warning("No data to display.")

    with tab_cards:
        st.markdown(f"<small>Showing items {start_idx + 1}-{end_idx} of {total_items}</small>", unsafe_allow_html=True)
        for real_idx, item_obj in current_items_slice:
            with st.expander(f"**Item {real_idx}**", expanded=False):
                try:
                    json_str = json.dumps(item_obj.model_dump(exclude_none=True), indent=2, ensure_ascii=False)
                    st.code(json_str, language="json")
                except AttributeError:
                    st.text(str(item_obj))

                # Layout-specific data loaders for the expanded card.
                if layout == "object_ndjson_lines":
                    if st.button(f"Load Data #{real_idx}", key=f"btn_load_ndjson_{real_idx}_{selected_dataset_key}"):
                        try:
                            data = snapshot.read_object_line(selected_dataset_key, real_idx)
                            st.info("Data Content:")
                            st.code(json.dumps(data, indent=2, ensure_ascii=False), language="json")
                        except Exception as e:
                            st.error(f"Error: {e}")

                elif layout == "frame_parquet_item_dirs":
                    try:
                        files = snapshot.read_frame_parts(selected_dataset_key, real_idx)
                        if files:
                            st.markdown("**Parquet Files:**")
                            for f in files:
                                st.code(f.name, language="text")
                        else:
                            st.caption("No files found.")
                    except Exception as e:
                        st.error(f"Error: {e}")

    if total_pages > 1:
        # Center the pagination bar under the tabs.
        center_cols = st.columns([1, 8, 1])
        with center_cols[1]:
            page = _render_pagination(page, total_pages, page_key)

st.divider()
357
+
358
# 4. Data Files Preview
st.subheader("Data Files (from data/)")

manifest = snapshot.manifest
manifest_ds = manifest.datasets.get(selected_dataset_key)
data_root = manifest.dataRoot or "data"
dataset_dir = snapshot.directory / data_root / selected_dataset_key

# Resolve layout: dataset metadata, then manifest, then on-disk detection.
resolved_layout = layout or (manifest_ds.layout if manifest_ds else None) or _detect_layout(dataset_dir)

if resolved_layout == "object_ndjson_lines":
    # --- NDJSON View ---
    ndjson_path = dataset_dir / "object.ndjson"
    if ndjson_path.exists():
        total_count = manifest_ds.count if manifest_ds else dataset_info["item_count"]
        st.caption(f"File: {ndjson_path} (count: {total_count})")

        page_size = 10
        # `or 1` guarantees at least one page even for an empty file.
        total_pages = (total_count + page_size - 1) // page_size or 1

        ndjson_page_key = f"ndjson_preview_page_{selected_dataset_key}"
        cp = st.session_state.get(f"{ndjson_page_key}_current", st.session_state.get(ndjson_page_key, 1))

        # Reset to page 1 if a stale page number exceeds the new page count.
        if cp > total_pages: cp = 1

        start = (cp - 1) * page_size
        limit = page_size

        st.text(f"Lines {start + 1}-{min(start + limit, total_count)}")

        lines_data = _read_ndjson_slice(ndjson_path, start, limit)
        for line_idx, obj in lines_data:
            with st.expander(f"Line {line_idx}", expanded=False):
                st.code(json.dumps(obj, indent=2, ensure_ascii=False), language="json")

        if total_pages > 1:
            center_cols = st.columns([1, 8, 1])
            with center_cols[1]:
                _render_pagination(cp, total_pages, ndjson_page_key)
    else:
        st.warning(f"File not found: {ndjson_path}")

elif resolved_layout == "frame_parquet_item_dirs":
    # --- Parquet View ---
    parquet_dir = dataset_dir / "parquet"
    # Fall back to the dataset root when there is no parquet/ subdirectory.
    search_dir = parquet_dir if parquet_dir.exists() else dataset_dir

    if search_dir.exists():
        st.caption(f"Searching Parquet in: {search_dir}")
        files = sorted(search_dir.rglob("*.parquet"))

        if not files:
            st.info("No parquet files found.")
        else:
            # --- File List Pagination ---
            page_size = 10
            total_pages = (len(files) + page_size - 1) // page_size or 1

            file_page_key = f"parquet_file_page_{selected_dataset_key}"
            cp_files = st.session_state.get(f"{file_page_key}_current", 1)

            # Reset to page 1 on a stale page number.
            if cp_files > total_pages: cp_files = 1

            start = (cp_files - 1) * page_size
            page_files = files[start : start + page_size]

            # Display File List
            with st.container(border=True):
                st.markdown(f"**Parquet Files (Page {cp_files}/{total_pages})**")
                for f in page_files:
                    st.text(f"📄 {f.relative_to(dataset_dir)}")

            if total_pages > 1:
                center_cols = st.columns([1, 8, 1])
                with center_cols[1]:
                    _render_pagination(cp_files, total_pages, file_page_key)

            st.divider()

            # --- File Selection Logic ---
            # Options span ALL files, not just the current page.
            options = [f.relative_to(dataset_dir) for f in files]
            select_key = f"parquet_file_select_{selected_dataset_key}"

            selected_rel = st.selectbox(
                "Select file to preview content",
                options=options,
                key=select_key,
            )

            # --- Parquet Content Preview ---
            sample_path = dataset_dir / selected_rel

            st.markdown(f"#### Preview: `{sample_path.name}`")

            try:
                # Imported lazily so the page still loads without pyarrow.
                import pyarrow.parquet as pq

                table = pq.read_table(sample_path)
                total_rows = table.num_rows

                if total_rows == 0:
                    st.warning("Empty file.")
                else:
                    row_page_size = 20
                    row_total_pages = (total_rows + row_page_size - 1) // row_page_size or 1

                    # Key includes the file path so each file pages independently.
                    preview_page_key = f"pq_view_{selected_dataset_key}_{str(selected_rel)}"

                    cp_row = st.session_state.get(f"{preview_page_key}_current", 1)
                    if cp_row > row_total_pages: cp_row = 1

                    start_r = (cp_row - 1) * row_page_size
                    table_slice = table.slice(start_r, row_page_size)

                    # Convert the arrow slice to a list of row dicts.
                    cols = table_slice.column_names
                    data_dict = table_slice.to_pydict()
                    rows = [{col: data_dict[col][i] for col in cols} for i in range(table_slice.num_rows)]

                    df_rows = pd.DataFrame(rows)

                    # --- Timestamp cleanup for display ---
                    # Round to whole seconds and drop timezone info;
                    # best-effort only, so failures are ignored.
                    for col in df_rows.columns:
                        if pd.api.types.is_datetime64_any_dtype(df_rows[col]):
                            try:
                                df_rows[col] = df_rows[col].dt.round('1s')
                                if df_rows[col].dt.tz is not None:
                                    df_rows[col] = df_rows[col].dt.tz_localize(None)
                            except Exception:
                                pass

                    # Use the custom static table instead of st.dataframe.
                    _render_static_table(df_rows)

                    if row_total_pages > 1:
                        st.caption(f"Page {cp_row} of {row_total_pages} ({total_rows} rows)")
                        center_cols = st.columns([1, 8, 1])
                        with center_cols[1]:
                            _render_pagination(cp_row, row_total_pages, preview_page_key)

            except ImportError:
                st.error("pyarrow not installed.")
            except Exception as e:
                st.error(f"Failed to read parquet: {e}")

    else:
        st.warning(f"Parquet directory not found: {search_dir}")
504
+
505
st.divider()

# DRS info
# Collapsible dump of the snapshot's Data Requirement Specification;
# loading/serialization errors are shown inline rather than crashing the page.
with st.expander("Show DRS (Data Requirement Specification)"):
    try:
        drs = snapshot.drs
        st.code(json.dumps(drs.model_dump(exclude_none=True), indent=2, ensure_ascii=False), language="json")
    except Exception as e:
        st.error(f"Failed to load DRS: {e}")