caption-flow 0.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,694 @@
1
+ """Arrow/Parquet storage management with list column support for captions."""
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ from dataclasses import asdict
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import List, Optional, Set, Dict, Any
10
+ import pyarrow as pa
11
+ import pyarrow.parquet as pq
12
+ from pyarrow import fs
13
+ import pandas as pd
14
+
15
+ from .models import Job, Caption, Contributor, JobStatus
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class StorageManager:
21
+ """Manages Arrow/Parquet storage for captions and jobs with list column support."""
22
+
23
+ def __init__(
24
+ self,
25
+ data_dir: Path,
26
+ caption_buffer_size: int = 100,
27
+ job_buffer_size: int = 100,
28
+ contributor_buffer_size: int = 10,
29
+ ):
30
+ self.data_dir = Path(data_dir)
31
+ self.data_dir.mkdir(parents=True, exist_ok=True)
32
+
33
+ # File paths
34
+ self.captions_path = self.data_dir / "captions.parquet"
35
+ self.jobs_path = self.data_dir / "jobs.parquet"
36
+ self.contributors_path = self.data_dir / "contributors.parquet"
37
+
38
+ # In-memory buffers for batching writes
39
+ self.caption_buffer = []
40
+ self.job_buffer = []
41
+ self.contributor_buffer = []
42
+
43
+ # Buffer size configuration
44
+ self.caption_buffer_size = caption_buffer_size
45
+ self.job_buffer_size = job_buffer_size
46
+ self.contributor_buffer_size = contributor_buffer_size
47
+
48
+ # Track existing job_ids to prevent duplicates
49
+ self.existing_contributor_ids: Set[str] = set()
50
+ self.existing_caption_job_ids: Set[str] = set()
51
+ self.existing_job_ids: Set[str] = set()
52
+
53
+ # Statistics
54
+ self.total_captions_written = 0
55
+ self.total_caption_entries_written = 0 # Total individual captions
56
+ self.total_flushes = 0
57
+ self.duplicates_skipped = 0
58
+
59
+ # Schemas - Updated caption schema to support list of captions
60
+ self.caption_schema = pa.schema(
61
+ [
62
+ ("job_id", pa.string()),
63
+ ("dataset", pa.string()),
64
+ ("shard", pa.string()),
65
+ ("chunk_id", pa.string()),
66
+ ("item_key", pa.string()),
67
+ ("captions", pa.list_(pa.string())), # Changed from single caption to list
68
+ ("caption_count", pa.int32()), # Number of captions for this item
69
+ ("contributor_id", pa.string()),
70
+ ("timestamp", pa.timestamp("us")),
71
+ ("quality_scores", pa.list_(pa.float32())), # Optional quality scores per caption
72
+ ("image_width", pa.int32()),
73
+ ("image_height", pa.int32()),
74
+ ("image_format", pa.string()),
75
+ ("file_size", pa.int64()),
76
+ ("processing_time_ms", pa.float32()),
77
+ ]
78
+ )
79
+
80
+ self.job_schema = pa.schema(
81
+ [
82
+ ("job_id", pa.string()),
83
+ ("dataset", pa.string()),
84
+ ("shard", pa.string()),
85
+ ("item_key", pa.string()),
86
+ ("status", pa.string()),
87
+ ("assigned_to", pa.string()),
88
+ ("created_at", pa.timestamp("us")),
89
+ ("updated_at", pa.timestamp("us")),
90
+ ]
91
+ )
92
+
93
+ self.contributor_schema = pa.schema(
94
+ [
95
+ ("contributor_id", pa.string()),
96
+ ("name", pa.string()),
97
+ ("total_captions", pa.int64()),
98
+ ("trust_level", pa.int32()),
99
+ ]
100
+ )
101
+
102
+ async def initialize(self):
103
+ """Initialize storage files if they don't exist."""
104
+ # Create empty parquet files if needed
105
+ if not self.captions_path.exists():
106
+ # Create empty table with schema using from_pydict
107
+ empty_dict = {
108
+ "job_id": [],
109
+ "dataset": [],
110
+ "shard": [],
111
+ "chunk_id": [],
112
+ "item_key": [],
113
+ "captions": [],
114
+ "caption_count": [],
115
+ "contributor_id": [],
116
+ "timestamp": [],
117
+ "quality_scores": [],
118
+ "image_width": [],
119
+ "image_height": [],
120
+ "image_format": [],
121
+ "file_size": [],
122
+ "processing_time_ms": [],
123
+ }
124
+ empty_table = pa.Table.from_pydict(empty_dict, schema=self.caption_schema)
125
+ pq.write_table(empty_table, self.captions_path)
126
+ logger.info(f"Created empty caption storage at {self.captions_path}")
127
+ else:
128
+ # Load existing caption job_ids to prevent duplicates
129
+ existing_captions = pq.read_table(self.captions_path, columns=["job_id"])
130
+ self.existing_caption_job_ids = set(existing_captions["job_id"].to_pylist())
131
+ logger.info(f"Loaded {len(self.existing_caption_job_ids)} existing caption job_ids")
132
+
133
+ if not self.jobs_path.exists():
134
+ # Create empty table with schema using from_pydict
135
+ empty_dict = {
136
+ "job_id": [],
137
+ "dataset": [],
138
+ "shard": [],
139
+ "item_key": [],
140
+ "status": [],
141
+ "assigned_to": [],
142
+ "created_at": [],
143
+ "updated_at": [],
144
+ }
145
+ empty_table = pa.Table.from_pydict(empty_dict, schema=self.job_schema)
146
+ pq.write_table(empty_table, self.jobs_path)
147
+ logger.info(f"Created empty job storage at {self.jobs_path}")
148
+ else:
149
+ # Load existing job_ids
150
+ existing_jobs = pq.read_table(self.jobs_path, columns=["job_id"])
151
+ self.existing_job_ids = set(existing_jobs["job_id"].to_pylist())
152
+ logger.info(f"Loaded {len(self.existing_job_ids)} existing job_ids")
153
+
154
+ if not self.contributors_path.exists():
155
+ # Create empty table with schema using from_pydict
156
+ empty_dict = {"contributor_id": [], "name": [], "total_captions": [], "trust_level": []}
157
+ empty_table = pa.Table.from_pydict(empty_dict, schema=self.contributor_schema)
158
+ pq.write_table(empty_table, self.contributors_path)
159
+ logger.info(f"Created empty contributor storage at {self.contributors_path}")
160
+ else:
161
+ # Load existing contributors
162
+ existing_contributors = pq.read_table(
163
+ self.contributors_path, columns=["contributor_id"]
164
+ )
165
+ self.existing_contributor_ids = set(existing_contributors["contributor_id"].to_pylist())
166
+ logger.info(f"Loaded {len(self.existing_contributor_ids)} existing contributor IDs")
167
+
168
+ async def save_captions(self, caption_data: Dict[str, Any]):
169
+ """Save captions for an image - single row with list of captions."""
170
+ job_id = caption_data["job_id"]
171
+
172
+ # Check if we already have captions for this job_id
173
+ if job_id in self.existing_caption_job_ids:
174
+ self.duplicates_skipped += 1
175
+ logger.debug(f"Skipping duplicate captions for job_id: {job_id}")
176
+ return
177
+
178
+ # Check if it's already in the buffer
179
+ for buffered in self.caption_buffer:
180
+ if buffered["job_id"] == job_id:
181
+ logger.debug(f"Captions for job_id {job_id} already in buffer")
182
+ return
183
+
184
+ # Ensure captions is a list (not a JSON string)
185
+ captions = caption_data.get("captions")
186
+ if isinstance(captions, str):
187
+ # If it's a JSON string, decode it
190
+ try:
191
+ captions = json.loads(captions)
192
+ caption_data["captions"] = captions
193
+ logger.warning(f"Decoded JSON string to list for job_id {job_id}")
194
+ except json.JSONDecodeError:
195
+ logger.error(f"Invalid captions format for job_id {job_id}")
196
+ return
197
+
198
+ if not isinstance(captions, list):
199
+ logger.error(f"Captions must be a list for job_id {job_id}, got {type(captions)}")
200
+ return
201
+
202
+ # Add caption count
203
+ caption_data["caption_count"] = len(captions)
204
+
205
+ # Add default values for optional fields if not present
206
+ if "quality_scores" not in caption_data:
207
+ caption_data["quality_scores"] = None
208
+
209
+ self.caption_buffer.append(caption_data)
210
+ self.existing_caption_job_ids.add(job_id)
211
+
212
+ # Log buffer status
213
+ logger.debug(f"Caption buffer size: {len(self.caption_buffer)}/{self.caption_buffer_size}")
214
+ logger.debug(f" Added captions for {job_id}: {len(captions)} captions")
215
+
216
+ # Flush if buffer is large enough
217
+ if len(self.caption_buffer) >= self.caption_buffer_size:
218
+ await self._flush_captions()
219
+
220
+ async def save_caption(self, caption: Caption):
221
+ """Save a single caption entry."""
222
+ # Convert to dict and ensure it's a list of captions
223
+ caption_dict = asdict(caption)
224
+ if "captions" in caption_dict and not isinstance(caption_dict["captions"], list):
225
+ caption_dict["captions"] = [caption_dict["captions"]]
226
+ elif "caption" in caption_dict and isinstance(caption_dict["caption"], str):
227
+ # If it's a single caption string, wrap it in a list
228
+ caption_dict["captions"] = [caption_dict["caption"]]
229
+ del caption_dict["caption"]
230
+
231
+ # Add to buffer
232
+ self.caption_buffer.append(caption_dict)
233
+
234
+ # Log buffer status
235
+ logger.debug(f"Caption buffer size: {len(self.caption_buffer)}/{self.caption_buffer_size}")
236
+
237
+ # Flush if buffer is large enough
238
+ if len(self.caption_buffer) >= self.caption_buffer_size:
239
+ await self._flush_captions()
240
+
241
+ async def save_job(self, job: Job):
242
+ """Save or update a job - buffers until batch size reached."""
243
+ # For updates, we still add to buffer (will be handled in flush)
244
+ self.job_buffer.append(
245
+ {
246
+ "job_id": job.job_id,
247
+ "dataset": job.dataset,
248
+ "shard": job.shard,
249
+ "item_key": job.item_key,
250
+ "status": job.status.value,
251
+ "assigned_to": job.assigned_to,
252
+ "created_at": job.created_at,
253
+ "updated_at": datetime.utcnow(),
254
+ }
255
+ )
256
+
257
+ self.existing_job_ids.add(job.job_id)
258
+
259
+ if len(self.job_buffer) >= self.job_buffer_size:
260
+ await self._flush_jobs()
261
+
262
+ async def save_contributor(self, contributor: Contributor):
263
+ """Save or update contributor stats - buffers until batch size reached."""
264
+ self.contributor_buffer.append(asdict(contributor))
265
+
266
+ if len(self.contributor_buffer) >= self.contributor_buffer_size:
267
+ await self._flush_contributors()
268
+
269
+ async def _flush_captions(self):
270
+ """Write caption buffer to parquet with deduplication."""
271
+ if not self.caption_buffer:
272
+ return
273
+
274
+ num_rows = len(self.caption_buffer)
275
+ num_captions = sum(len(row["captions"]) for row in self.caption_buffer)
276
+ logger.info(f"Flushing {num_rows} rows with {num_captions} total captions to disk")
277
+
278
+ # Ensure all captions are proper lists before creating table
279
+ for row in self.caption_buffer:
280
+ if isinstance(row["captions"], str):
283
+ try:
284
+ row["captions"] = json.loads(row["captions"])
285
+ except json.JSONDecodeError:
286
+ logger.error(f"Failed to decode captions for {row['job_id']}")
287
+ row["captions"] = [row["captions"]] # Wrap string in list as fallback
288
+
289
+ # Create table from buffer with explicit schema
290
+ table = pa.Table.from_pylist(self.caption_buffer, schema=self.caption_schema)
291
+
292
+ if self.captions_path.exists():
293
+ # Read existing table
294
+ existing = pq.read_table(self.captions_path)
295
+
296
+ # Get existing job_ids for deduplication
297
+ existing_job_ids = set(existing.column("job_id").to_pylist())
298
+
299
+ # Filter new data to exclude duplicates
300
+ new_rows = []
301
+ for row in self.caption_buffer:
302
+ if row["job_id"] not in existing_job_ids:
303
+ new_rows.append(row)
304
+
305
+ if new_rows:
306
+ # Create table from new rows only
307
+ new_table = pa.Table.from_pylist(new_rows, schema=self.caption_schema)
308
+
309
+ # Combine tables using PyArrow concat (preserves list types better)
310
+ combined = pa.concat_tables([existing, new_table])
311
+
312
+ # Write with proper list column preservation
313
+ pq.write_table(combined, self.captions_path, compression="snappy")
314
+
315
+ logger.info(
316
+ f"Added {len(new_rows)} new rows (skipped {num_rows - len(new_rows)} duplicates)"
317
+ )
318
+ actual_new = len(new_rows)
319
+ else:
320
+ logger.info(f"All {num_rows} rows were duplicates, skipping write")
321
+ actual_new = 0
322
+ else:
323
+ # Write new file with proper list columns
324
+ pq.write_table(table, self.captions_path, compression="snappy")
325
+ new_rows = self.caption_buffer  # with no existing file, every buffered row is new
+ actual_new = num_rows
326
+
327
+ self.total_captions_written += actual_new
328
+ self.total_caption_entries_written += sum(
329
+ len(row["captions"]) for row in self.caption_buffer[:actual_new]
330
+ )
331
+ self.total_flushes += 1
332
+ self.caption_buffer.clear()
333
+
334
+ logger.info(
335
+ f"Successfully wrote captions (rows: {self.total_captions_written}, "
336
+ f"total captions: {self.total_caption_entries_written}, "
337
+ f"duplicates skipped: {self.duplicates_skipped})"
338
+ )
339
+
340
+ async def _flush_jobs(self):
341
+ """Write job buffer to parquet."""
342
+ if not self.job_buffer:
343
+ return
344
+
345
+ table = pa.Table.from_pylist(self.job_buffer, schema=self.job_schema)
346
+
347
+ # For jobs, we need to handle updates (upsert logic)
348
+ if self.jobs_path.exists():
349
+ existing = pq.read_table(self.jobs_path).to_pandas()
350
+ new_df = table.to_pandas()
351
+
352
+ # Update existing records or add new ones
353
+ for _, row in new_df.iterrows():
354
+ mask = existing["job_id"] == row["job_id"]
355
+ if mask.any():
356
+ # Update existing
357
+ for col in row.index:
358
+ existing.loc[mask, col] = row[col]
359
+ else:
360
+ # Add new
361
+ existing = pd.concat([existing, pd.DataFrame([row])], ignore_index=True)
362
+
363
+ updated_table = pa.Table.from_pandas(existing, schema=self.job_schema)
364
+ pq.write_table(updated_table, self.jobs_path)
365
+ else:
366
+ pq.write_table(table, self.jobs_path)
367
+
368
+ logger.debug(f"Flushed {len(self.job_buffer)} jobs")
369
+ self.job_buffer.clear()
370
+
371
+ async def _flush_contributors(self):
372
+ """Write contributor buffer to parquet."""
373
+ if not self.contributor_buffer:
374
+ return
375
+
376
+ table = pa.Table.from_pylist(self.contributor_buffer, schema=self.contributor_schema)
377
+
378
+ # Handle updates for contributors
379
+ if self.contributors_path.exists():
380
+ existing = pq.read_table(self.contributors_path).to_pandas()
381
+ new_df = table.to_pandas()
382
+
383
+ for _, row in new_df.iterrows():
384
+ mask = existing["contributor_id"] == row["contributor_id"]
385
+ if mask.any():
386
+ for col in row.index:
387
+ existing.loc[mask, col] = row[col]
388
+ else:
389
+ existing = pd.concat([existing, pd.DataFrame([row])], ignore_index=True)
390
+
391
+ updated_table = pa.Table.from_pandas(existing, schema=self.contributor_schema)
392
+ pq.write_table(updated_table, self.contributors_path)
393
+ else:
394
+ pq.write_table(table, self.contributors_path)
395
+
396
+ self.contributor_buffer.clear()
397
+
398
+ async def checkpoint(self):
399
+ """Force flush all buffers to disk - called periodically by orchestrator."""
400
+ logger.info(
401
+ f"Checkpoint: Flushing buffers (captions: {len(self.caption_buffer)}, "
402
+ f"jobs: {len(self.job_buffer)}, contributors: {len(self.contributor_buffer)})"
403
+ )
404
+
405
+ await self._flush_captions()
406
+ await self._flush_jobs()
407
+ await self._flush_contributors()
408
+
409
+ logger.info(
410
+ f"Checkpoint complete. Total rows: {self.total_captions_written}, "
411
+ f"Total caption entries: {self.total_caption_entries_written}, "
412
+ f"Duplicates skipped: {self.duplicates_skipped}"
413
+ )
414
+
415
+ async def job_exists(self, job_id: str) -> bool:
416
+ """Check if a job already exists in storage or buffer."""
417
+ if job_id in self.existing_job_ids:
418
+ return True
419
+
420
+ # Check buffer
421
+ for buffered in self.job_buffer:
422
+ if buffered["job_id"] == job_id:
423
+ return True
424
+
425
+ return False
426
+
427
+ async def get_captions(self, job_id: str) -> Optional[List[str]]:
428
+ """Retrieve captions for a specific job_id."""
429
+ # Check buffer first
430
+ for buffered in self.caption_buffer:
431
+ if buffered["job_id"] == job_id:
432
+ return buffered["captions"]
433
+
434
+ if not self.captions_path.exists():
435
+ return None
436
+
437
+ table = pq.read_table(self.captions_path)
438
+ df = table.to_pandas()
439
+
440
+ row = df[df["job_id"] == job_id]
441
+ if row.empty:
442
+ return None
443
+
444
+ captions = row.iloc[0]["captions"]
445
+
446
+ # Handle both correct list storage and incorrect JSON string storage
447
+ if isinstance(captions, str):
448
+ # This shouldn't happen with correct storage, but handle legacy data
449
+ try:
450
+ captions = json.loads(captions)
451
+ logger.warning(f"Had to decode JSON string for job_id {job_id} - file needs fixing")
452
+ except json.JSONDecodeError:
453
+ captions = [captions] # Wrap single string as list
454
+
455
+ return captions
456
+
457
+ async def get_job(self, job_id: str) -> Optional[Job]:
458
+ """Retrieve a job by ID."""
459
+ # Check buffer first
460
+ for buffered in self.job_buffer:
461
+ if buffered["job_id"] == job_id:
462
+ return Job(
463
+ job_id=buffered["job_id"],
464
+ dataset=buffered["dataset"],
465
+ shard=buffered["shard"],
466
+ item_key=buffered["item_key"],
467
+ status=JobStatus(buffered["status"]),
468
+ assigned_to=buffered["assigned_to"],
469
+ created_at=buffered["created_at"],
470
+ )
471
+
472
+ if not self.jobs_path.exists():
473
+ return None
474
+
475
+ table = pq.read_table(self.jobs_path)
476
+ df = table.to_pandas()
477
+
478
+ row = df[df["job_id"] == job_id]
479
+ if row.empty:
480
+ return None
481
+
482
+ return Job(
483
+ job_id=row.iloc[0]["job_id"],
484
+ dataset=row.iloc[0]["dataset"],
485
+ shard=row.iloc[0]["shard"],
486
+ item_key=row.iloc[0]["item_key"],
487
+ status=JobStatus(row.iloc[0]["status"]),
488
+ assigned_to=row.iloc[0]["assigned_to"],
489
+ created_at=row.iloc[0]["created_at"],
490
+ )
491
+
492
+ async def get_jobs_by_worker(self, worker_id: str) -> List[Job]:
493
+ """Get all jobs assigned to a worker."""
494
+ if not self.jobs_path.exists():
495
+ return []
496
+
497
+ table = pq.read_table(self.jobs_path)
498
+ df = table.to_pandas()
499
+
500
+ rows = df[df["assigned_to"] == worker_id]
501
+
502
+ jobs = []
503
+ for _, row in rows.iterrows():
504
+ jobs.append(
505
+ Job(
506
+ job_id=row["job_id"],
507
+ dataset=row["dataset"],
508
+ shard=row["shard"],
509
+ item_key=row["item_key"],
510
+ status=JobStatus(row["status"]),
511
+ assigned_to=row["assigned_to"],
512
+ created_at=row["created_at"],
513
+ )
514
+ )
515
+
516
+ return jobs
517
+
518
+ async def get_caption_stats(self) -> Dict[str, Any]:
519
+ """Get statistics about stored captions."""
520
+ if not self.captions_path.exists():
521
+ return {
522
+ "total_rows": 0,
523
+ "total_captions": 0,
524
+ "avg_captions_per_image": 0,
525
+ "min_captions": 0,
526
+ "max_captions": 0,
527
+ }
528
+
529
+ table = pq.read_table(self.captions_path)
530
+ df = table.to_pandas()
531
+
532
+ if len(df) == 0:
533
+ return {
534
+ "total_rows": 0,
535
+ "total_captions": 0,
536
+ "avg_captions_per_image": 0,
537
+ "min_captions": 0,
538
+ "max_captions": 0,
539
+ }
540
+
541
+ caption_counts = df["caption_count"].values
542
+
543
+ return {
544
+ "total_rows": len(df),
545
+ "total_captions": caption_counts.sum(),
546
+ "avg_captions_per_image": caption_counts.mean(),
547
+ "min_captions": caption_counts.min(),
548
+ "max_captions": caption_counts.max(),
549
+ "std_captions": caption_counts.std(),
550
+ }
551
+
552
+ async def get_sample_captions(self, n: int = 5) -> List[Dict[str, Any]]:
553
+ """Get a sample of caption entries for inspection."""
554
+ if not self.captions_path.exists():
555
+ return []
556
+
557
+ table = pq.read_table(self.captions_path)
558
+ df = table.to_pandas()
559
+
560
+ if len(df) == 0:
561
+ return []
562
+
563
+ sample_df = df.sample(min(n, len(df)))
564
+ samples = []
565
+
566
+ for _, row in sample_df.iterrows():
567
+ samples.append(
568
+ {
569
+ "job_id": row["job_id"],
570
+ "item_key": row["item_key"],
571
+ "captions": row["captions"],
572
+ "caption_count": row["caption_count"],
573
+ "image_dims": f"{row.get('image_width', 'N/A')}x{row.get('image_height', 'N/A')}",
574
+ }
575
+ )
576
+
577
+ return samples
578
+
579
+ async def count_captions(self) -> int:
580
+ """Count total caption entries (not rows)."""
581
+ if not self.captions_path.exists():
582
+ return 0
583
+
584
+ table = pq.read_table(self.captions_path, columns=["caption_count"])
585
+ df = table.to_pandas()
586
+ return df["caption_count"].sum()
587
+
588
+ async def count_caption_rows(self) -> int:
589
+ """Count total rows (unique images with captions)."""
590
+ if not self.captions_path.exists():
591
+ return 0
592
+
593
+ table = pq.read_table(self.captions_path)
594
+ return len(table)
595
+
596
+ async def get_contributor(self, contributor_id: str) -> Optional[Contributor]:
597
+ """Retrieve a contributor by ID."""
598
+ # Check buffer first
599
+ for buffered in self.contributor_buffer:
600
+ if buffered["contributor_id"] == contributor_id:
601
+ return Contributor(**buffered)
602
+
603
+ if not self.contributors_path.exists():
604
+ return None
605
+
606
+ table = pq.read_table(self.contributors_path)
607
+ df = table.to_pandas()
608
+
609
+ row = df[df["contributor_id"] == contributor_id]
610
+ if row.empty:
611
+ return None
612
+
613
+ return Contributor(
614
+ contributor_id=row.iloc[0]["contributor_id"],
615
+ name=row.iloc[0]["name"],
616
+ total_captions=int(row.iloc[0]["total_captions"]),
617
+ trust_level=int(row.iloc[0]["trust_level"]),
618
+ )
619
+
620
+ async def get_top_contributors(self, limit: int = 10) -> List[Contributor]:
621
+ """Get top contributors by caption count."""
622
+ contributors = []
623
+
624
+ if self.contributors_path.exists():
625
+ table = pq.read_table(self.contributors_path)
626
+ df = table.to_pandas()
627
+
628
+ # Sort by total_captions descending
629
+ df = df.sort_values("total_captions", ascending=False).head(limit)
630
+
631
+ for _, row in df.iterrows():
632
+ contributors.append(
633
+ Contributor(
634
+ contributor_id=row["contributor_id"],
635
+ name=row["name"],
636
+ total_captions=int(row["total_captions"]),
637
+ trust_level=int(row["trust_level"]),
638
+ )
639
+ )
640
+
641
+ return contributors
642
+
643
+ async def get_pending_jobs(self) -> List[Job]:
644
+ """Get all pending jobs for restoration on startup."""
645
+ if not self.jobs_path.exists():
646
+ return []
647
+
648
+ table = pq.read_table(self.jobs_path)
649
+ df = table.to_pandas()
650
+
651
+ # Get jobs with PENDING or PROCESSING status
652
+ pending_df = df[df["status"].isin([JobStatus.PENDING.value, JobStatus.PROCESSING.value])]
653
+
654
+ jobs = []
655
+ for _, row in pending_df.iterrows():
656
+ jobs.append(
657
+ Job(
658
+ job_id=row["job_id"],
659
+ dataset=row["dataset"],
660
+ shard=row["shard"],
661
+ item_key=row["item_key"],
662
+ status=JobStatus(row["status"]),
663
+ assigned_to=row.get("assigned_to"),
664
+ created_at=row["created_at"],
665
+ )
666
+ )
667
+
668
+ return jobs
669
+
670
+ async def count_jobs(self) -> int:
671
+ """Count total jobs."""
672
+ if not self.jobs_path.exists():
673
+ return 0
674
+
675
+ table = pq.read_table(self.jobs_path)
676
+ return len(table)
677
+
678
+ async def count_completed_jobs(self) -> int:
679
+ """Count completed jobs."""
680
+ if not self.jobs_path.exists():
681
+ return 0
682
+
683
+ table = pq.read_table(self.jobs_path)
684
+ df = table.to_pandas()
685
+ return len(df[df["status"] == JobStatus.COMPLETED.value])
686
+
687
+ async def close(self):
688
+ """Close storage and flush buffers."""
689
+ await self.checkpoint()
690
+ logger.info(
691
+ f"Storage closed. Total rows: {self.total_captions_written}, "
692
+ f"Total caption entries: {self.total_caption_entries_written}, "
693
+ f"Duplicates skipped: {self.duplicates_skipped}"
694
+ )
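
For orientation, here is a minimal usage sketch of the StorageManager defined above. It is illustrative only and not part of the package contents shown in this diff: the import path caption_flow.storage and all field values are assumptions, and it passes only fields named in the caption schema from __init__ (missing optional columns are stored as nulls).

import asyncio
from datetime import datetime
from pathlib import Path

from caption_flow.storage import StorageManager  # assumed import path

async def main():
    # Small buffer so the example flushes quickly.
    storage = StorageManager(Path("./caption_data"), caption_buffer_size=2)
    await storage.initialize()

    # One image's captions are stored as a single row with a list column.
    await storage.save_captions({
        "job_id": "job-0001",                      # hypothetical identifiers
        "dataset": "example-dataset",
        "shard": "shard-00000",
        "chunk_id": "chunk-0",
        "item_key": "img_0001",
        "captions": ["a cat sitting on a sofa", "a tabby cat indoors"],
        "contributor_id": "worker-1",
        "timestamp": datetime.utcnow(),
    })

    await storage.checkpoint()                     # force-flush buffers to parquet
    print(await storage.get_caption_stats())
    await storage.close()

asyncio.run(main())
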
@@ -0,0 +1,4 @@
1
+ """Utility modules for CaptionFlow."""
2
+
3
+ from .dataset_loader import DatasetLoader, ShardTracker
4
+ from .caption_utils import CaptionUtils