caption-flow 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/cli.py +2 -1
- caption_flow/models.py +108 -1
- caption_flow/monitor.py +1 -1
- caption_flow/orchestrator.py +423 -1595
- caption_flow/processors/__init__.py +11 -0
- caption_flow/processors/base.py +219 -0
- caption_flow/processors/huggingface.py +832 -0
- caption_flow/processors/local_filesystem.py +683 -0
- caption_flow/processors/webdataset.py +782 -0
- caption_flow/storage.py +415 -406
- caption_flow/utils/checkpoint_tracker.py +2 -2
- caption_flow/utils/chunk_tracker.py +94 -35
- caption_flow/utils/dataset_loader.py +64 -522
- caption_flow/utils/dataset_metadata_cache.py +67 -0
- caption_flow/utils/image_processor.py +1 -4
- caption_flow/utils/shard_processor.py +4 -200
- caption_flow/utils/shard_tracker.py +1 -5
- caption_flow/workers/base.py +3 -3
- caption_flow/workers/caption.py +416 -792
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/METADATA +29 -27
- caption_flow-0.2.3.dist-info/RECORD +35 -0
- caption_flow-0.2.1.dist-info/RECORD +0 -29
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/WHEEL +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/top_level.txt +0 -0
caption_flow/storage.py
CHANGED
```diff
@@ -4,18 +4,21 @@ import asyncio
 import json
 import logging
 from dataclasses import asdict
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 from typing import List, Optional, Set, Dict, Any
 import pyarrow as pa
 import pyarrow.parquet as pq
 from pyarrow import fs
 import pandas as pd
-from collections import defaultdict
+from collections import defaultdict, deque
+import time
+import numpy as np
 
-from .models import Job, Caption, Contributor, JobStatus
+from .models import Job, Caption, Contributor, JobStatus, JobId
 
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 
 
 class StorageManager:
```
```diff
@@ -60,6 +63,11 @@ class StorageManager:
         self.total_flushes = 0
         self.duplicates_skipped = 0
 
+        # Rate tracking
+        self.row_additions = deque(maxlen=10000)  # Store (timestamp, row_count) tuples
+        self.start_time = time.time()
+        self.last_rate_log_time = time.time()
+
         # Base caption schema without dynamic output fields
         self.base_caption_fields = [
             ("job_id", pa.string()),
```
```diff
@@ -68,6 +76,8 @@ class StorageManager:
             ("chunk_id", pa.string()),
             ("item_key", pa.string()),
             ("item_index", pa.int32()),
+            ("filename", pa.string()),
+            ("url", pa.string()),
             ("caption_count", pa.int32()),
             ("contributor_id", pa.string()),
             ("timestamp", pa.timestamp("us")),
```
```diff
@@ -105,6 +115,137 @@ class StorageManager:
             ]
         )
 
+    def _is_column_empty(self, df: pd.DataFrame, column_name: str) -> bool:
+        """Check if a column is entirely empty, null, or contains only zeros/empty lists."""
+        if column_name not in df.columns:
+            return True
+
+        col = df[column_name]
+
+        # Check if all values are null/NaN
+        if col.isna().all():
+            return True
+
+        # For numeric columns, check if all non-null values are 0
+        if pd.api.types.is_numeric_dtype(col):
+            non_null_values = col.dropna()
+            if len(non_null_values) > 0 and (non_null_values == 0).all():
+                return True
+
+        # For list columns, check if all are None or empty lists
+        if col.dtype == "object":
+            non_null_values = col.dropna()
+            if len(non_null_values) == 0:
+                return True
+            # Check if all non-null values are empty lists
+            all_empty_lists = True
+            for val in non_null_values:
+                if isinstance(val, list) and len(val) > 0:
+                    all_empty_lists = False
+                    break
+                elif not isinstance(val, list):
+                    all_empty_lists = False
+                    break
+            if all_empty_lists:
+                return True
+
+        return False
+
+    def _get_non_empty_columns(
+        self, df: pd.DataFrame, preserve_base_fields: bool = True
+    ) -> List[str]:
+        """Get list of columns that contain actual data.
+
+        Args:
+            df: DataFrame to check
+            preserve_base_fields: If True, always include base fields even if empty
+        """
+        base_field_names = {field[0] for field in self.base_caption_fields}
+        non_empty_columns = []
+
+        for col in df.columns:
+            # Always keep base fields if preserve_base_fields is True
+            if preserve_base_fields and col in base_field_names:
+                non_empty_columns.append(col)
+            elif not self._is_column_empty(df, col):
+                non_empty_columns.append(col)
+
+        return non_empty_columns
+
+    def _calculate_rates(self) -> Dict[str, float]:
+        """Calculate row addition rates over different time windows."""
+        current_time = time.time()
+        rates = {}
+
+        # Define time windows in minutes
+        windows = {"1min": 1, "5min": 5, "15min": 15, "60min": 60}
+
+        # Clean up old entries beyond the largest window
+        cutoff_time = current_time - (60 * 60)  # 60 minutes
+        while self.row_additions and self.row_additions[0][0] < cutoff_time:
+            self.row_additions.popleft()
+
+        # Calculate rates for each window
+        for window_name, window_minutes in windows.items():
+            window_seconds = window_minutes * 60
+            window_start = current_time - window_seconds
+
+            # Sum rows added within this window
+            rows_in_window = sum(
+                count for timestamp, count in self.row_additions if timestamp >= window_start
+            )
+
+            # Calculate rate (rows per second)
+            # For windows larger than elapsed time, use elapsed time
+            elapsed = current_time - self.start_time
+            actual_window = min(window_seconds, elapsed)
+
+            if actual_window > 0:
+                rate = rows_in_window / actual_window
+                rates[window_name] = rate
+            else:
+                rates[window_name] = 0.0
+
+        # Calculate instantaneous rate (last minute)
+        instant_window_start = current_time - 60  # Last 60 seconds
+        instant_rows = sum(
+            count for timestamp, count in self.row_additions if timestamp >= instant_window_start
+        )
+        instant_window = min(60, current_time - self.start_time)
+        rates["instant"] = instant_rows / instant_window if instant_window > 0 else 0.0
+
+        # Calculate overall rate since start
+        total_elapsed = current_time - self.start_time
+        if total_elapsed > 0:
+            rates["overall"] = self.total_captions_written / total_elapsed
+        else:
+            rates["overall"] = 0.0
+
+        return rates
+
+    def _log_rates(self, rows_added: int):
+        """Log rate information if enough time has passed."""
+        current_time = time.time()
+
+        # Log rates every 10 seconds or if it's been more than 30 seconds
+        time_since_last_log = current_time - self.last_rate_log_time
+        if time_since_last_log < 10 and rows_added < 50:
+            return
+
+        rates = self._calculate_rates()
+
+        # Format the rate information
+        rate_str = (
+            f"Rate stats - Instant: {rates['instant']:.1f} rows/s | "
+            f"Avg (5m): {rates['5min']:.1f} | "
+            f"Avg (15m): {rates['15min']:.1f} | "
+            f"Avg (60m): {rates['60min']:.1f} | "
+            f"Overall: {rates['overall']:.1f} rows/s"
+        )
+
+        logger.info(rate_str)
+        self.last_rate_log_time = current_time
+
     def _get_existing_output_columns(self) -> Set[str]:
         """Get output field columns that actually exist in the parquet file."""
         if not self.captions_path.exists():
```
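The block above is the heart of the new throughput logging: `row_additions` holds `(timestamp, row_count)` tuples in a bounded deque, entries older than the largest window are evicted, and each window's rate divides the rows seen in that window by the smaller of the window length and the total elapsed time. A self-contained sketch of the same pattern (the `RateTracker` name and the window choices are illustrative, not part of the package):

```python
import time
from collections import deque
from typing import Dict


class RateTracker:
    """Sliding-window throughput tracker over (timestamp, count) events."""

    def __init__(self, max_events: int = 10000):
        self.events = deque(maxlen=max_events)  # (timestamp, count) tuples
        self.start_time = time.time()

    def record(self, count: int) -> None:
        self.events.append((time.time(), count))

    def rates(self, windows_s=(60, 300, 900, 3600)) -> Dict[int, float]:
        now = time.time()
        # Drop events older than the largest window we report on
        cutoff = now - max(windows_s)
        while self.events and self.events[0][0] < cutoff:
            self.events.popleft()
        out = {}
        for w in windows_s:
            start = now - w
            total = sum(c for t, c in self.events if t >= start)
            # Never divide by more time than has actually elapsed
            effective = min(w, now - self.start_time)
            out[w] = total / effective if effective > 0 else 0.0
        return out


if __name__ == "__main__":
    tracker = RateTracker()
    tracker.record(32)
    time.sleep(0.1)
    tracker.record(64)
    print(tracker.rates())  # rows/s over 1, 5, 15 and 60 minute windows
```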
```diff
@@ -216,9 +357,14 @@ class StorageManager:
         if "outputs" in df.columns:
             df = df.drop(columns=["outputs"])
 
-        #
-        self.
-
+        # Remove empty columns before saving (but preserve base fields)
+        non_empty_columns = self._get_non_empty_columns(df, preserve_base_fields=True)
+        df = df[non_empty_columns]
+
+        # Update known fields and schema based on non-empty columns
+        base_field_names = {field[0] for field in self.base_caption_fields}
+        self.known_output_fields = set(non_empty_columns) - base_field_names
+        self.caption_schema = self._build_caption_schema(self.known_output_fields)
 
         # Write migrated table
         migrated_table = pa.Table.from_pandas(df, schema=self.caption_schema)
```
```diff
@@ -226,8 +372,7 @@ class StorageManager:
         logger.info("Migration complete - outputs now stored in dynamic columns")
 
     async def save_caption(self, caption: Caption):
-        """Save a caption entry
-        # Convert to dict
+        """Save a caption entry, grouping outputs by job_id/item_key (not separating captions)."""
         caption_dict = asdict(caption)
 
         # Extract item_index from metadata if present
```
```diff
@@ -242,16 +387,61 @@ class StorageManager:
         # Remove old "captions" field if it exists (will be in outputs)
         caption_dict.pop("captions", None)
 
-
+        # Grouping key: (job_id, item_key)
+        _job_id = caption_dict.get("job_id")
+        job_id = JobId.from_dict(_job_id).get_sample_str()
+        group_key = job_id
+        logger.debug(
+            f"save_caption: group_key={group_key}, outputs={list(outputs.keys())}, caption_count={caption_dict.get('caption_count')}, item_index={caption_dict.get('item_index')}"
+        )
+
+        # Try to find existing buffered row for this group
+        found_row = False
+        for idx, row in enumerate(self.caption_buffer):
+            check_key = row.get("job_id")
+            logger.debug(f"Checking buffer row {idx}: check_key={check_key}, group_key={group_key}")
+            if check_key == group_key:
+                found_row = True
+                logger.debug(f"Found existing buffer row for group_key={group_key} at index {idx}")
+                # Merge outputs into existing row
+                for field_name, field_values in outputs.items():
+                    if field_name not in self.known_output_fields:
+                        self.known_output_fields.add(field_name)
+                        logger.info(f"New output field detected: {field_name}")
+                    if field_name in row and isinstance(row[field_name], list):
+                        logger.debug(
+                            f"Merging output field '{field_name}' into existing row: before={row[field_name]}, adding={field_values}"
+                        )
+                        row[field_name].extend(field_values)
+                        logger.debug(f"After merge: {row[field_name]}")
+                    else:
+                        logger.debug(
+                            f"Setting new output field '{field_name}' in existing row: {field_values}"
+                        )
+                        row[field_name] = list(field_values)
+                # Optionally update other fields (e.g., caption_count)
+                if "caption_count" in caption_dict:
+                    old_count = row.get("caption_count", 0)
+                    row["caption_count"] = old_count + caption_dict["caption_count"]
+                    logger.debug(
+                        f"Updated caption_count for group_key={group_key}: {old_count} + {caption_dict['caption_count']} = {row['caption_count']}"
+                    )
+                return  # Already merged, no need to add new row
+            else:
+                logger.debug(f"Caption row not found for group key: {group_key} vs {check_key}")
+
+        if not found_row:
+            logger.debug(
+                f"No existing buffer row found for group_key={group_key}, creating new row."
+            )
+
+        # If not found, create new row
         for field_name, field_values in outputs.items():
-            caption_dict[field_name] = field_values
             if field_name not in self.known_output_fields:
-
-
-
-
-                logger.info(f"New output fields detected: {sorted(new_fields)}")
-                logger.info(f"Total known output fields: {sorted(self.known_output_fields)}")
+                self.known_output_fields.add(field_name)
+                logger.info(f"New output field detected: {field_name}")
+            caption_dict[field_name] = list(field_values)
+            logger.debug(f"Adding output field '{field_name}' to new row: {field_values}")
 
         # Serialize metadata to JSON if present
         if "metadata" in caption_dict:
```
```diff
@@ -259,68 +449,16 @@ class StorageManager:
             else:
                 caption_dict["metadata"] = "{}"
 
-
-
-
-        # Log buffer status
-        logger.debug(f"Caption buffer size: {len(self.caption_buffer)}/{self.caption_buffer_size}")
-
-        # Flush if buffer is large enough
-        if len(self.caption_buffer) >= self.caption_buffer_size:
-            await self._flush_captions()
-
-    async def save_captions(self, caption_data: Dict[str, Any]):
-        """Save captions for an image - compatible with dict input."""
-        job_id = caption_data["job_id"]
-
-        # Check if we already have captions for this job_id
-        if job_id in self.existing_caption_job_ids:
-            self.duplicates_skipped += 1
-            logger.debug(f"Skipping duplicate captions for job_id: {job_id}")
-            return
-
-        # Check if it's already in the buffer
-        for buffered in self.caption_buffer:
-            if buffered["job_id"] == job_id:
-                logger.debug(f"Captions for job_id {job_id} already in buffer")
-                return
-
-        # Handle outputs if present
-        if "outputs" in caption_data:
-            outputs = caption_data.pop("outputs")
-            # Add each output field directly to caption_data
-            for field_name, field_values in outputs.items():
-                caption_data[field_name] = field_values
-                if field_name not in self.known_output_fields:
-                    self.known_output_fields.add(field_name)
-                    logger.info(f"New output field detected: {field_name}")
-
-        # Handle legacy captions field
-        if "captions" in caption_data and "captions" not in self.known_output_fields:
-            self.known_output_fields.add("captions")
-
-        # Count all outputs
-        caption_count = 0
-        for field_name in self.known_output_fields:
-            if field_name in caption_data and isinstance(caption_data[field_name], list):
-                caption_count += len(caption_data[field_name])
-
-        caption_data["caption_count"] = caption_count
-
-        # Add default values for optional fields
-        if "quality_scores" not in caption_data:
-            caption_data["quality_scores"] = None
+        if isinstance(caption_dict.get("job_id"), dict):
+            caption_dict["job_id"] = job_id
 
-
-
-
-
-
-        self.caption_buffer.append(caption_data)
-        self.existing_caption_job_ids.add(job_id)
+        self.caption_buffer.append(caption_dict)
+        logger.debug(
+            f"Appended new caption row for group_key={group_key}. Caption buffer size: {len(self.caption_buffer)}/{self.caption_buffer_size}"
+        )
 
-        # Flush if buffer is large enough
         if len(self.caption_buffer) >= self.caption_buffer_size:
+            logger.debug("Caption buffer full, flushing captions.")
             await self._flush_captions()
 
     async def _flush_captions(self):
```
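Taken together, the two hunks above change `save_caption` from one-row-per-result to one-row-per-sample: a result arriving for a `job_id` that is already buffered has its output lists extended in place and its `caption_count` incremented, and only unseen samples append a new buffer row. A stripped-down sketch of that merge-into-buffer behaviour (the `merge_into_buffer` helper and its simplified fields are illustrative only):

```python
from typing import Any, Dict, List


def merge_into_buffer(buffer: List[Dict[str, Any]], job_id: str,
                      outputs: Dict[str, List[str]]) -> None:
    """Merge output lists into an existing buffered row for job_id, or append a new row."""
    for row in buffer:
        if row.get("job_id") == job_id:
            for field, values in outputs.items():
                if isinstance(row.get(field), list):
                    row[field].extend(values)  # same field seen again: extend in place
                else:
                    row[field] = list(values)  # first time this field appears on the row
            row["caption_count"] = row.get("caption_count", 0) + sum(len(v) for v in outputs.values())
            return
    # No existing row for this job_id: start a new one
    new_row = {"job_id": job_id, "caption_count": sum(len(v) for v in outputs.values())}
    for field, values in outputs.items():
        new_row[field] = list(values)
    buffer.append(new_row)


buffer: List[Dict[str, Any]] = []
merge_into_buffer(buffer, "ds:shard:0:42", {"captions": ["a photo of a dog"]})
merge_into_buffer(buffer, "ds:shard:0:42", {"captions": ["a dog on grass"], "tags": ["dog"]})
print(buffer)  # a single row holding both captions plus the tag list
```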
```diff
@@ -337,25 +475,7 @@ class StorageManager:
                 if field_name in row and isinstance(row[field_name], list):
                     total_outputs += len(row[field_name])
 
-        logger.
-
-        # Check if we need to evolve the schema
-        current_schema_fields = set(self.caption_schema.names) if self.caption_schema else set()
-        all_fields_needed = set(
-            self.base_caption_fields[i][0] for i in range(len(self.base_caption_fields))
-        )
-        all_fields_needed.update(self.known_output_fields)
-
-        if all_fields_needed != current_schema_fields:
-            # Schema evolution needed
-            logger.info(
-                f"Evolving schema to include new fields: {all_fields_needed - current_schema_fields}"
-            )
-            self.caption_schema = self._build_caption_schema(self.known_output_fields)
-
-            # If file exists, we need to migrate it
-            if self.captions_path.exists():
-                await self._evolve_schema_on_disk()
+        logger.debug(f"Flushing {num_rows} rows with {total_outputs} total outputs to disk")
 
         # Prepare data with all required columns
         prepared_buffer = []
```
```diff
@@ -374,8 +494,9 @@ class StorageManager:
 
             prepared_buffer.append(prepared_row)
 
-        #
-
+        # Build schema with all known fields (base + output)
+        schema = self._build_caption_schema(self.known_output_fields)
+        table = pa.Table.from_pylist(prepared_buffer, schema=schema)
 
         if self.captions_path.exists():
             # Read existing table
```
```diff
@@ -386,45 +507,150 @@ class StorageManager:
 
             # Filter new data to exclude duplicates
             new_rows = []
+            duplicate_rows = []
             for row in prepared_buffer:
                 if row["job_id"] not in existing_job_ids:
                     new_rows.append(row)
+                elif row not in duplicate_rows:
+                    duplicate_rows.append(
+                        {
+                            "input": row,
+                            "existing_job": existing.to_pandas()[
+                                existing.to_pandas()["job_id"] == row["job_id"]
+                            ].to_dict(orient="records"),
+                        }
+                    )
+
+            if duplicate_rows:
+                logger.info(f"Example duplicate row: {duplicate_rows[0]}")
 
             if new_rows:
                 # Create table from new rows only
-                new_table = pa.Table.from_pylist(new_rows, schema=
+                new_table = pa.Table.from_pylist(new_rows, schema=schema)
 
-                #
-                combined = pa.concat_tables([existing, new_table])
+                # Concatenate with promote_options="default" to handle schema differences automatically
+                combined = pa.concat_tables([existing, new_table], promote_options="default")
 
-                # Write
+                # Write combined table
                 pq.write_table(combined, self.captions_path, compression="snappy")
 
-
-                    f"Added {len(new_rows)} new rows (skipped {num_rows - len(new_rows)} duplicates)"
-                )
+                self.duplicates_skipped = num_rows - len(new_rows)
                 actual_new = len(new_rows)
             else:
-                logger.info(f"All {num_rows} rows were duplicates,
-
+                logger.info(f"All {num_rows} rows were duplicates, exiting")
+                raise SystemError("No duplicates can be submitted")
         else:
-            # Write new file
+            # Write new file with all fields
             pq.write_table(table, self.captions_path, compression="snappy")
             actual_new = num_rows
 
+        # Update statistics
         self.total_captions_written += actual_new
         self.total_caption_entries_written += total_outputs
         self.total_flushes += 1
         self.caption_buffer.clear()
 
+        # Track row additions for rate calculation
+        if actual_new > 0:
+            current_time = time.time()
+            self.row_additions.append((current_time, actual_new))
+
+            # Log rates
+            self._log_rates(actual_new)
+
         logger.info(
-            f"Successfully wrote captions (rows: {
-            f"total
-            f"
+            f"Successfully wrote captions (new rows: {actual_new}, "
+            f"total rows written: {self.total_captions_written}, "
+            f"total captions written: {self.total_caption_entries_written}, "
+            f"duplicates skipped: {self.duplicates_skipped}, "
+            f"output fields: {sorted(list(self.known_output_fields))})"
         )
 
+    async def optimize_storage(self):
+        """Optimize storage by dropping empty columns. Run this periodically or on-demand."""
+        if not self.captions_path.exists():
+            logger.info("No captions file to optimize")
+            return
+
+        logger.info("Starting storage optimization...")
+
+        # Read the full table
+        table = pq.read_table(self.captions_path)
+        df = table.to_pandas()
+        original_columns = len(df.columns)
+
+        # Find non-empty columns (don't preserve empty base fields)
+        non_empty_columns = self._get_non_empty_columns(df, preserve_base_fields=False)
+
+        # Always keep at least job_id
+        if "job_id" not in non_empty_columns:
+            non_empty_columns.append("job_id")
+
+        if len(non_empty_columns) < original_columns:
+            # We have columns to drop
+            df_optimized = df[non_empty_columns]
+
+            # Rebuild schema for non-empty columns only
+            base_field_names = {f[0] for f in self.base_caption_fields}
+            fields = []
+            output_fields = set()
+
+            # Process columns in a consistent order: base fields first, then output fields
+            for col in non_empty_columns:
+                if col in base_field_names:
+                    # Find the base field definition
+                    for fname, ftype in self.base_caption_fields:
+                        if fname == col:
+                            fields.append((fname, ftype))
+                            break
+                else:
+                    # Output field
+                    output_fields.add(col)
+
+            # Add output fields in sorted order
+            for field_name in sorted(output_fields):
+                fields.append((field_name, pa.list_(pa.string())))
+
+            # Create optimized schema and table
+            optimized_schema = pa.schema(fields)
+            optimized_table = pa.Table.from_pandas(df_optimized, schema=optimized_schema)
+
+            # Backup the original file (optional)
+            backup_path = self.captions_path.with_suffix(".parquet.bak")
+            import shutil
+
+            shutil.copy2(self.captions_path, backup_path)
+
+            # Write optimized table
+            pq.write_table(optimized_table, self.captions_path, compression="snappy")
+
+            # Update known output fields
+            self.known_output_fields = output_fields
+
+            # Clean up backup (optional - keep it for safety)
+            # backup_path.unlink()
+
+            logger.info(
+                f"Storage optimization complete: {original_columns} -> {len(non_empty_columns)} columns. "
+                f"Removed columns: {sorted(set(df.columns) - set(non_empty_columns))}"
+            )
+        else:
+            logger.info(f"No optimization needed - all {original_columns} columns contain data")
+
+        # Report file size reduction
+        import os
+
+        if backup_path and backup_path.exists():
+            original_size = os.path.getsize(backup_path)
+            new_size = os.path.getsize(self.captions_path)
+            reduction_pct = (1 - new_size / original_size) * 100
+            logger.info(
+                f"File size: {original_size/1024/1024:.1f}MB -> {new_size/1024/1024:.1f}MB "
+                f"({reduction_pct:.1f}% reduction)"
+            )
+
     async def _evolve_schema_on_disk(self):
-        """Evolve the schema of the existing parquet file to include new columns."""
+        """Evolve the schema of the existing parquet file to include new columns, removing empty ones."""
         logger.info("Evolving schema on disk to add new columns...")
 
         # Read existing data
```
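The flush path now relies on Arrow schema promotion rather than rewriting the whole file whenever a new output field appears: with `promote_options="default"`, `pa.concat_tables` unifies the on-disk table and the new batch and null-fills columns that exist on only one side. A minimal example of what that option does (assumes a pyarrow release new enough to accept `promote_options`):

```python
import pyarrow as pa

# Two batches whose schemas drifted: the second carries an extra output column.
t1 = pa.table({"job_id": ["a", "b"], "captions": [["x"], ["y"]]})
t2 = pa.table({"job_id": ["c"], "captions": [["z"]], "tags": [["cat"]]})

# promote_options="default" unifies the schemas; missing columns become nulls
combined = pa.concat_tables([t1, t2], promote_options="default")
print(combined.schema)
print(combined.column("tags").to_pylist())  # [None, None, ['cat']]
```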
```diff
@@ -437,63 +663,24 @@ class StorageManager:
                 df[field_name] = None
                 logger.info(f"Added new column: {field_name}")
 
-        #
-
-
-        logger.info("Schema evolution complete")
+        # Remove empty columns (but preserve base fields)
+        non_empty_columns = self._get_non_empty_columns(df, preserve_base_fields=True)
+        df = df[non_empty_columns]
 
-
-
-
-        for buffered in self.caption_buffer:
-            if buffered["job_id"] == job_id:
-                outputs = {}
-                for field_name in self.known_output_fields:
-                    if field_name in buffered and buffered[field_name]:
-                        outputs[field_name] = buffered[field_name]
-                return outputs
-
-        if not self.captions_path.exists():
-            return None
-
-        table = pq.read_table(self.captions_path)
-        df = table.to_pandas()
+        # Update known output fields
+        base_field_names = {field[0] for field in self.base_caption_fields}
+        self.known_output_fields = set(non_empty_columns) - base_field_names
 
-
-
-        return None
+        # Recreate schema with only non-empty fields
+        self.caption_schema = self._build_caption_schema(self.known_output_fields)
 
-        #
-
-
-
-
-        if pd.notna(value) and value is not None:
-            outputs[field_name] = value
-
-        return outputs if outputs else None
-
-    async def save_job(self, job: Job):
-        """Save or update a job - buffers until batch size reached."""
-        # For updates, we still add to buffer (will be handled in flush)
-        self.job_buffer.append(
-            {
-                "job_id": job.job_id,
-                "dataset": job.dataset,
-                "shard": job.shard,
-                "item_key": job.item_key,
-                "status": job.status.value,
-                "assigned_to": job.assigned_to,
-                "created_at": job.created_at,
-                "updated_at": datetime.utcnow(),
-            }
+        # Recreate table with new schema
+        evolved_table = pa.Table.from_pandas(df, schema=self.caption_schema)
+        pq.write_table(evolved_table, self.captions_path, compression="snappy")
+        logger.info(
+            f"Schema evolution complete. Active output fields: {sorted(list(self.known_output_fields))}"
         )
 
-        self.existing_job_ids.add(job.job_id)
-
-        if len(self.job_buffer) >= self.job_buffer_size:
-            await self._flush_jobs()
-
     async def save_contributor(self, contributor: Contributor):
         """Save or update contributor stats - buffers until batch size reached."""
         self.contributor_buffer.append(asdict(contributor))
```
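Both `_evolve_schema_on_disk` above and `optimize_storage` reuse the same emptiness test from `_is_column_empty`: a column is considered dead when every value is null, zero, or an empty list. A compact standalone illustration of that predicate (simplified relative to the method in the diff):

```python
import pandas as pd


def column_is_empty(col: pd.Series) -> bool:
    """True if the column holds only nulls, zeros, or empty lists."""
    values = col.dropna()
    if values.empty:
        return True
    if pd.api.types.is_numeric_dtype(col):
        return bool((values == 0).all())
    # object columns: treat a column of empty lists as empty
    return all(isinstance(v, list) and len(v) == 0 for v in values)


df = pd.DataFrame({
    "job_id": ["a", "b"],
    "unused_scores": [0, 0],
    "notes": [[], []],
    "captions": [["x"], []],
})
print([c for c in df.columns if column_is_empty(df[c])])  # ['unused_scores', 'notes']
```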
```diff
@@ -570,84 +757,51 @@ class StorageManager:
         await self._flush_jobs()
         await self._flush_contributors()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return False
-
-    async def get_job(self, job_id: str) -> Optional[Job]:
-        """Retrieve a job by ID."""
-        # Check buffer first
-        for buffered in self.job_buffer:
-            if buffered["job_id"] == job_id:
-                return Job(
-                    job_id=buffered["job_id"],
-                    dataset=buffered["dataset"],
-                    shard=buffered["shard"],
-                    item_key=buffered["item_key"],
-                    status=JobStatus(buffered["status"]),
-                    assigned_to=buffered["assigned_to"],
-                    created_at=buffered["created_at"],
-                )
+        # Log final rate statistics
+        if self.total_captions_written > 0:
+            rates = self._calculate_rates()
+            logger.info(
+                f"Checkpoint complete. Total rows: {self.total_captions_written}, "
+                f"Total caption entries: {self.total_caption_entries_written}, "
+                f"Duplicates skipped: {self.duplicates_skipped} | "
+                f"Overall rate: {rates['overall']:.1f} rows/s"
+            )
+        else:
+            logger.info(
+                f"Checkpoint complete. Total rows: {self.total_captions_written}, "
+                f"Total caption entries: {self.total_caption_entries_written}, "
+                f"Duplicates skipped: {self.duplicates_skipped}"
+            )
 
-
-
+    def get_all_processed_job_ids(self) -> Set[str]:
+        """Get all processed job_ids - useful for resumption."""
+        if not self.captions_path.exists():
+            logger.info("No captions file found, returning empty processed job_ids set")
+            return set()
 
-
-
+        # Read only the job_id column
+        table = pq.read_table(self.captions_path, columns=["job_id"])
+        job_ids = set(table["job_id"].to_pylist())
 
-
-
-
+        # Add buffered job_ids
+        for row in self.caption_buffer:
+            if "job_id" in row:
+                job_ids.add(row["job_id"])
 
-        return
-            job_id=row.iloc[0]["job_id"],
-            dataset=row.iloc[0]["dataset"],
-            shard=row.iloc[0]["shard"],
-            item_key=row.iloc[0]["item_key"],
-            status=JobStatus(row.iloc[0]["status"]),
-            assigned_to=row.iloc[0]["assigned_to"],
-            created_at=row.iloc[0]["created_at"],
-        )
+        return job_ids
 
-    async def
-        """Get all
-        if not self.
-            return
+    async def get_processed_jobs_for_chunk(self, chunk_id: str) -> Set[str]:
+        """Get all processed job_ids for a given chunk."""
+        if not self.captions_path.exists():
+            return set()
 
-
+        # Read only job_id and chunk_id columns
+        table = pq.read_table(self.captions_path, columns=["job_id", "chunk_id"])
         df = table.to_pandas()
 
-
-
-
-        for _, row in rows.iterrows():
-            jobs.append(
-                Job(
-                    job_id=row["job_id"],
-                    dataset=row["dataset"],
-                    shard=row["shard"],
-                    item_key=row["item_key"],
-                    status=JobStatus(row["status"]),
-                    assigned_to=row["assigned_to"],
-                    created_at=row["created_at"],
-                )
-            )
-
-        return jobs
+        # Filter by chunk_id and return job_ids
+        chunk_jobs = df[df["chunk_id"] == chunk_id]["job_id"].tolist()
+        return set(chunk_jobs)
 
     async def get_caption_stats(self) -> Dict[str, Any]:
         """Get statistics about stored captions including field-specific stats."""
```
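The resumption helpers introduced above only ever need identifiers, so they pass `columns=[...]` to `pq.read_table` and let Parquet's columnar layout skip decoding the caption data entirely. A small sketch of the same column-pruned read (the demo file name is hypothetical):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Write a tiny captions file for the demo
pq.write_table(
    pa.table({"job_id": ["a", "b", "b"], "chunk_id": ["c0", "c0", "c1"],
              "captions": [["x"], ["y"], ["z"]]}),
    "captions_demo.parquet",
)

# Only the job_id column is read and decoded
table = pq.read_table("captions_demo.parquet", columns=["job_id"])
processed = set(table["job_id"].to_pylist())
print(processed)  # {'a', 'b'}
```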
```diff
@@ -678,11 +832,12 @@ class StorageManager:
                 field_lengths = []
 
                 for value in df.loc[non_null_mask, field_name]:
+                    # list or array-like
                     if isinstance(value, list):
                         length = len(value)
                         field_total += length
                         field_lengths.append(length)
-                    elif
+                    elif value.any():
                         length = 1
                         field_total += length
                         field_lengths.append(length)
```
```diff
@@ -714,46 +869,6 @@ class StorageManager:
             },
         }
 
-    async def get_sample_captions(self, n: int = 5) -> List[Dict[str, Any]]:
-        """Get a sample of caption entries showing all output fields."""
-        if not self.captions_path.exists():
-            return []
-
-        table = pq.read_table(self.captions_path)
-        df = table.to_pandas()
-
-        if len(df) == 0:
-            return []
-
-        sample_df = df.sample(min(n, len(df)))
-        samples = []
-
-        for _, row in sample_df.iterrows():
-            # Collect outputs from dynamic columns
-            outputs = {}
-            total_outputs = 0
-
-            for field_name in self.known_output_fields:
-                if field_name in row and pd.notna(row[field_name]):
-                    value = row[field_name]
-                    outputs[field_name] = value
-                    if isinstance(value, list):
-                        total_outputs += len(value)
-
-            samples.append(
-                {
-                    "job_id": row["job_id"],
-                    "item_key": row["item_key"],
-                    "outputs": outputs,
-                    "field_count": len(outputs),
-                    "total_outputs": total_outputs,
-                    "image_dims": f"{row.get('image_width', 'N/A')}x{row.get('image_height', 'N/A')}",
-                    "has_metadata": bool(row.get("metadata") and row["metadata"] != "{}"),
-                }
-            )
-
-        return samples
-
     async def count_captions(self) -> int:
         """Count total outputs across all dynamic fields."""
         total = 0
```
```diff
@@ -883,142 +998,26 @@ class StorageManager:
             "fields": sorted(list(field_counts.keys())),
         }
 
-    async def get_captions_with_field(
-        self, field_name: str, limit: int = 100
-    ) -> List[Dict[str, Any]]:
-        """Get captions that have a specific output field."""
-        if not self.captions_path.exists():
-            return []
-
-        if field_name not in self.known_output_fields:
-            logger.warning(f"Field '{field_name}' not found in known output fields")
-            return []
-
-        # Check if the field actually exists in the file
-        existing_output_columns = self._get_existing_output_columns()
-        if field_name not in existing_output_columns:
-            logger.warning(
-                f"Field '{field_name}' exists in known fields but not in parquet file yet"
-            )
-            return []
-
-        # Only read necessary columns
-        columns_to_read = ["job_id", "item_key", field_name]
-
-        try:
-            table = pq.read_table(self.captions_path, columns=columns_to_read)
-        except Exception as e:
-            logger.error(f"Error reading field '{field_name}': {e}")
-            return []
-
-        df = table.to_pandas()
-
-        # Filter rows where field has data
-        mask = df[field_name].notna()
-        filtered_df = df[mask].head(limit)
-
-        results = []
-        for _, row in filtered_df.iterrows():
-            results.append(
-                {
-                    "job_id": row["job_id"],
-                    "item_key": row["item_key"],
-                    field_name: row[field_name],
-                    "value_count": len(row[field_name]) if isinstance(row[field_name], list) else 1,
-                }
-            )
-
-        return results
-
-    async def export_by_field(self, field_name: str, output_path: Path, format: str = "jsonl"):
-        """Export all captions for a specific field."""
-        if not self.captions_path.exists():
-            logger.warning("No captions to export")
-            return 0
-
-        if field_name not in self.known_output_fields:
-            logger.warning(f"Field '{field_name}' not found in known output fields")
-            return 0
-
-        # Check if the field actually exists in the file
-        existing_output_columns = self._get_existing_output_columns()
-        if field_name not in existing_output_columns:
-            logger.warning(f"Field '{field_name}' not found in parquet file")
-            return 0
-
-        # Read only necessary columns
-        columns_to_read = ["item_key", "dataset", field_name]
-        table = pq.read_table(self.captions_path, columns=columns_to_read)
-        df = table.to_pandas()
-
-        exported = 0
-        with open(output_path, "w") as f:
-            for _, row in df.iterrows():
-                if pd.notna(row[field_name]) and row[field_name]:
-                    if format == "jsonl":
-                        record = {
-                            "item_key": row["item_key"],
-                            "dataset": row["dataset"],
-                            field_name: row[field_name],
-                        }
-                        f.write(json.dumps(record) + "\n")
-                        exported += 1
-
-        logger.info(f"Exported {exported} items with field '{field_name}' to {output_path}")
-        return exported
-
-    async def get_pending_jobs(self) -> List[Job]:
-        """Get all pending jobs for restoration on startup."""
-        if not self.jobs_path.exists():
-            return []
-
-        table = pq.read_table(self.jobs_path)
-        df = table.to_pandas()
-
-        # Get jobs with PENDING or PROCESSING status
-        pending_df = df[df["status"].isin([JobStatus.PENDING.value, JobStatus.PROCESSING.value])]
-
-        jobs = []
-        for _, row in pending_df.iterrows():
-            jobs.append(
-                Job(
-                    job_id=row["job_id"],
-                    dataset=row["dataset"],
-                    shard=row["shard"],
-                    item_key=row["item_key"],
-                    status=JobStatus(row["status"]),
-                    assigned_to=row.get("assigned_to"),
-                    created_at=row["created_at"],
-                )
-            )
-
-        return jobs
-
-    async def count_jobs(self) -> int:
-        """Count total jobs."""
-        if not self.jobs_path.exists():
-            return 0
-
-        table = pq.read_table(self.jobs_path)
-        return len(table)
-
-    async def count_completed_jobs(self) -> int:
-        """Count completed jobs."""
-        if not self.jobs_path.exists():
-            return 0
-
-        table = pq.read_table(self.jobs_path)
-        df = table.to_pandas()
-        return len(df[df["status"] == JobStatus.COMPLETED.value])
-
     async def close(self):
         """Close storage and flush buffers."""
         await self.checkpoint()
-
-
-
-
-
+
+        # Log final rate statistics
+        if self.total_captions_written > 0:
+            rates = self._calculate_rates()
+            logger.info(
+                f"Storage closed. Total rows: {self.total_captions_written}, "
+                f"Total caption entries: {self.total_caption_entries_written}, "
+                f"Duplicates skipped: {self.duplicates_skipped} | "
+                f"Final rates - Overall: {rates['overall']:.1f} rows/s, "
+                f"Last hour: {rates['60min']:.1f} rows/s"
+            )
+        else:
+            logger.info(
+                f"Storage closed. Total rows: {self.total_captions_written}, "
+                f"Total caption entries: {self.total_caption_entries_written}, "
+                f"Duplicates skipped: {self.duplicates_skipped}"
+            )
 
     async def get_storage_stats(self) -> Dict[str, Any]:
         """Get all storage-related statistics."""
```
```diff
@@ -1036,6 +1035,9 @@ class StorageManager:
         field_stats = await self.get_caption_stats()
         total_rows_including_buffer = await self.count_caption_rows() + len(self.caption_buffer)
 
+        # Calculate rates
+        rates = self._calculate_rates()
+
         return {
             "total_captions": disk_outputs + buffer_outputs,
             "total_rows": total_rows_including_buffer,
```
```diff
@@ -1048,4 +1050,11 @@ class StorageManager:
             "field_breakdown": field_stats.get("field_stats", None),
             "job_buffer_size": len(self.job_buffer),
             "contributor_buffer_size": len(self.contributor_buffer),
+            "rates": {
+                "instant": f"{rates['instant']:.1f} rows/s",
+                "5min": f"{rates['5min']:.1f} rows/s",
+                "15min": f"{rates['15min']:.1f} rows/s",
+                "60min": f"{rates['60min']:.1f} rows/s",
+                "overall": f"{rates['overall']:.1f} rows/s",
+            },
         }
```