caption-flow 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- caption_flow/cli.py +8 -2
- caption_flow/monitor.py +1 -1
- caption_flow/orchestrator.py +522 -129
- caption_flow/storage.py +5 -0
- caption_flow/utils/chunk_tracker.py +22 -4
- caption_flow/utils/dataset_loader.py +99 -142
- caption_flow/utils/shard_processor.py +100 -36
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/METADATA +2 -1
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/RECORD +13 -13
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/WHEEL +0 -0
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/top_level.txt +0 -0
caption_flow/storage.py
CHANGED
@@ -386,10 +386,15 @@ class StorageManager:
 
         # Filter new data to exclude duplicates
         new_rows = []
+        duplicate_rows = []
         for row in prepared_buffer:
             if row["job_id"] not in existing_job_ids:
                 new_rows.append(row)
+            elif row not in duplicate_rows:
+                duplicate_rows.append(row)
 
+        if duplicate_rows:
+            logger.info(f"Example duplicate row: {duplicate_rows[0]}")
         if new_rows:
             # Create table from new rows only
             new_table = pa.Table.from_pylist(new_rows, schema=self.caption_schema)
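The added bookkeeping above only collects and logs duplicates; the existing job_id filter is unchanged. As a rough standalone sketch of the filter-then-append pattern (the schema, the `existing_job_ids` set, and the buffer below are made-up stand-ins, not the StorageManager internals):

```python
# Illustrative only: dedup rows by job_id before building a PyArrow table.
import pyarrow as pa

# Hypothetical schema and inputs, not the real caption schema.
caption_schema = pa.schema([("job_id", pa.string()), ("caption", pa.string())])
existing_job_ids = {"job-001"}
prepared_buffer = [
    {"job_id": "job-001", "caption": "a dog on a beach"},  # already stored
    {"job_id": "job-002", "caption": "a cat on a sofa"},   # new
]

new_rows, duplicate_rows = [], []
for row in prepared_buffer:
    if row["job_id"] not in existing_job_ids:
        new_rows.append(row)
    elif row not in duplicate_rows:
        duplicate_rows.append(row)

if duplicate_rows:
    print(f"Example duplicate row: {duplicate_rows[0]}")

if new_rows:
    new_table = pa.Table.from_pylist(new_rows, schema=caption_schema)
    print(new_table.num_rows)  # 1
```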

caption_flow/utils/chunk_tracker.py
CHANGED
@@ -441,9 +441,27 @@ class ChunkTracker(CheckpointTracker):
         )
 
     def get_chunk_with_unprocessed_items(self, chunk_id: str) -> Optional[Dict[str, Any]]:
-        """Get chunk info
-
+        """Get chunk info with unprocessed item ranges."""
+        chunk_state = self.chunks.get(chunk_id)
+        if not chunk_state:
             return None
 
-
-
+        # During startup or if no worker is assigned, treat all unprocessed as available
+        if not hasattr(self, "_startup_complete"):
+            self._startup_complete = False
+
+        if not self._startup_complete or not chunk_state.assigned_to:
+            # Return all unprocessed ranges
+            return {
+                "chunk_id": chunk_id,
+                "unprocessed_ranges": chunk_state.get_unprocessed_ranges(),
+                "status": chunk_state.status,
+            }
+
+        # Normal operation - only return ranges not being worked on
+        # This would need more complex tracking of which ranges each worker is processing
+        return {
+            "chunk_id": chunk_id,
+            "unprocessed_ranges": chunk_state.get_unprocessed_ranges(),
+            "status": chunk_state.status,
+        }
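`get_unprocessed_ranges()` itself is not part of this diff, but the new code above passes its result straight through, so the shape matters: inclusive `(start, end)` pairs of chunk-relative indices. A standalone sketch of how such ranges can be derived from a set of processed indices (illustrative only, not the ChunkTracker implementation):

```python
# Illustrative helper: compute inclusive unprocessed (start, end) ranges for a
# chunk of `chunk_size` items, given the chunk-relative indices already done.
from typing import List, Set, Tuple


def unprocessed_ranges(chunk_size: int, processed: Set[int]) -> List[Tuple[int, int]]:
    ranges: List[Tuple[int, int]] = []
    start = None
    for idx in range(chunk_size):
        if idx not in processed:
            if start is None:
                start = idx  # open a new unprocessed run
        elif start is not None:
            ranges.append((start, idx - 1))  # close the current run
            start = None
    if start is not None:
        ranges.append((start, chunk_size - 1))
    return ranges


print(unprocessed_ranges(10, {0, 1, 2, 5, 6}))  # [(3, 4), (7, 9)]
```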

caption_flow/utils/dataset_loader.py
CHANGED
@@ -217,200 +217,157 @@ class DatasetLoader:
         return dataset_path, start_idx, chunk_size
 
     def iterate_shard(
-        self,
+        self,
+        shard_url: str,
+        processed_keys: Optional[set] = None,
+        unprocessed_ranges: Optional[List[Tuple[int, int]]] = None,
     ) -> Generator[Tuple[str, str, bytes], None, None]:
         """
         Iterate over items in a shard.
 
+        Args:
+            shard_url: URL or identifier of the shard
+            processed_keys: Set of already processed keys to skip
+            unprocessed_ranges: Specific ranges to process (for HF datasets)
+
         Yields:
             Tuple of (key, url, image_bytes)
         """
-        # Check if this is a virtual HuggingFace dataset shard
         if shard_url.startswith("hf_dataset:"):
-
+            raise ValueError(
+                "Virtual HuggingFace dataset shards should use iterate_shard_with_metadata()"
+            )
         else:
             # Regular WebDataset shard
             ds = self.load_shard(shard_url, processed_keys)
             for key, url, image_data in ds:
                 yield key, url, image_data
 
-    def
-
-    ) -> Generator[Tuple[str, str, bytes, Dict[str, Any]], None, None]:
-        """Iterate over a virtual HuggingFace dataset shard with metadata."""
-        if processed_keys is None:
-            processed_keys = set()
-
-        dataset_path, start_idx, chunk_size = self._parse_virtual_shard(shard_url)
-
-        logger.info(
-            f"Loading HuggingFace dataset with metadata: {dataset_path} (split: {self.split})"
-        )
-
+    def _create_dataset_at_position(self, dataset_path: str, split: str, start_idx: int):
+        """Create a dataset iterator positioned at start_idx using state_dict if available."""
         try:
             # Load dataset in streaming mode
             dataset = load_dataset(
                 dataset_path,
-                split=
+                split=split,
                 streaming=True,
                 token=self.token,
             )
 
-            #
-            if
-            dataset
-
-            items_processed = 0
-
-            for item in dataset:
-                # Stop after processing chunk_size items
-                if items_processed >= chunk_size:
-                    break
-
-                # Generate a unique key for this item - CONSISTENT FORMAT
-                key = f"{dataset_path.replace('/', '_')}_{start_idx + items_processed:08d}"
-
-                if key in processed_keys:
-                    items_processed += 1
-                    continue
-
+            # Check if the dataset supports state_dict (newer versions of datasets library)
+            if hasattr(dataset, "load_state_dict") and hasattr(dataset, "state_dict"):
+                # Try to use the dataset's native state management
                 try:
-                    #
-
-
-
-
-
-
-
-
-
+                    # Get current state
+                    state = dataset.state_dict()
+
+                    # Modify the state to skip to start_idx
+                    if "epoch" in state:
+                        state["epoch"] = 0
+                    if "num_examples_since_previous_state" in state:
+                        state["num_examples_since_previous_state"] = start_idx
+
+                    # For newer datasets with examples_iterable state
+                    if "examples_iterable" in state:
+                        if isinstance(state["examples_iterable"], dict):
+                            if "shard_example_idx" in state["examples_iterable"]:
+                                state["examples_iterable"]["shard_example_idx"] = start_idx
+
+                    # Load the modified state
+                    dataset.load_state_dict(state)
+                    logger.info(f"Positioned dataset at index {start_idx} using state_dict")
+                    return dataset
+                except Exception as e:
+                    logger.debug(f"Could not use state_dict approach: {e}")
 
-
-
-
-
-                    else:
-                        logger.warning(
-                            f"Failed to process image for item at index {start_idx + items_processed}"
-                        )
-                        items_processed += 1
-                        continue
-                else:
-                    logger.warning(
-                        f"No image column '{self.image_column}' found in item at index {start_idx + items_processed}. "
-                        f"Available columns: {list(item.keys())}"
-                    )
-                    items_processed += 1
+            # Fall back to skip() for large skips
+            if start_idx > 0:
+                logger.info(f"Using skip() to position dataset at index {start_idx}")
+                dataset = dataset.skip(start_idx)
 
-
-                    logger.error(
-                        f"Error processing item at index {start_idx + items_processed}: {e}"
-                    )
-                    items_processed += 1
-                    continue
+            return dataset
 
         except Exception as e:
-            logger.
-            return
+            logger.warning(f"Error creating positioned dataset: {e}")
+            return None
 
-    def
+    def _iterate_hf_dataset_shard_with_metadata(
         self, shard_url: str, processed_keys: Optional[set] = None
-    ) -> Generator[Tuple[str, str, bytes], None, None]:
-        """Iterate over a virtual HuggingFace dataset shard."""
+    ) -> Generator[Tuple[str, str, bytes, Dict[str, Any]], None, None]:
+        """Iterate over a virtual HuggingFace dataset shard with metadata."""
         if processed_keys is None:
             processed_keys = set()
 
         dataset_path, start_idx, chunk_size = self._parse_virtual_shard(shard_url)
 
-        # IMPORTANT: Check if start_idx is beyond dataset bounds
-        if self._hf_total_items is not None and start_idx >= self._hf_total_items:
-            logger.warning(
-                f"Virtual shard starts at index {start_idx} but dataset only has "
-                f"{self._hf_total_items} items. Skipping this shard."
-            )
-            return
-
         logger.info(
-            f"Loading HuggingFace dataset
-            f"(split: {self.split}, start: {start_idx}, chunk_size: {chunk_size})"
+            f"Loading HuggingFace dataset with metadata: {dataset_path} (split: {self.split})"
         )
 
         try:
-            #
-
-
-
-
-
-
+            # For HF datasets, we iterate through the full chunk range
+            # The actual range filtering happens in the shard processor
+            items_processed = 0
+            current_abs_idx = start_idx
+
+            while items_processed < chunk_size:
+                # Create a fresh dataset iterator for each batch
+                # This avoids issues with stateful iterators
+                batch_size = min(1000, chunk_size - items_processed)  # Process in smaller batches
+
+                dataset = load_dataset(
+                    dataset_path,
+                    split=self.split,
+                    streaming=True,
+                    token=self.token,
+                )
 
-
-
-
-                logger.info(f"Skipped to index {start_idx}")
+                # Skip to current position
+                if current_abs_idx > 0:
+                    dataset = dataset.skip(current_abs_idx)
 
-
+                batch_processed = 0
+                for item in dataset:
+                    if batch_processed >= batch_size or items_processed >= chunk_size:
+                        break
 
-
-
-                # Stop after processing chunk_size items
-                if items_processed >= chunk_size:
-                    logger.info(f"Completed chunk: processed {items_processed} items")
-                    break
-
-                # Also stop if we've reached the dataset end
-                if self._hf_total_items and (start_idx + items_processed) >= self._hf_total_items:
-                    logger.info(
-                        f"Reached dataset end at item {start_idx + items_processed} "
-                        f"(total: {self._hf_total_items})"
-                    )
-                    break
+                    # Generate key
+                    key = f"{dataset_path.replace('/', '_')}_{current_abs_idx:08d}"
 
-
-
+                    if key in processed_keys:
+                        current_abs_idx += 1
+                        batch_processed += 1
+                        items_processed += 1
+                        continue
 
-
-
-
+                    try:
+                        if self.image_column in item:
+                            img_data = item[self.image_column]
+                            image_bytes = ImageProcessor.process_image_data(img_data)
 
-
-
-
-                        img_data = item[self.image_column]
+                            if image_bytes:
+                                metadata = {k: v for k, v in item.items() if k != self.image_column}
+                                url = f"hf://{dataset_path}#{current_abs_idx}"
 
-
-                        image_bytes = ImageProcessor.process_image_data(img_data)
+                                yield key, url, image_bytes, metadata
 
-
-
-                        url = f"hf://{dataset_path}#{start_idx + items_processed}"
+                                current_abs_idx += 1
+                                batch_processed += 1
                                 items_processed += 1
-                        yield key, url, image_bytes
                             else:
                                 logger.warning(
-                            f"
+                                    f"No image column '{self.image_column}' at index {current_abs_idx}"
                                 )
+                                current_abs_idx += 1
+                                batch_processed += 1
                                 items_processed += 1
-                        continue
-                else:
-                    logger.warning(
-                        f"No image column '{self.image_column}' found in item at index {start_idx + items_processed}. "
-                        f"Available columns: {list(item.keys())}"
-                    )
-                    items_processed += 1
 
-
-
-
-
-
-
-
-            logger.info(
-                f"Virtual shard complete: processed {items_processed} items "
-                f"(start_idx: {start_idx})"
-            )
+                    except Exception as e:
+                        logger.error(f"Error processing item at index {current_abs_idx}: {e}")
+                        current_abs_idx += 1
+                        batch_processed += 1
+                        items_processed += 1
+                        continue
 
         except Exception as e:
             logger.error(f"Error loading HuggingFace dataset: {e}")
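The new `_create_dataset_at_position` helper tries the `state_dict()`/`load_state_dict()` checkpointing API that recent releases of the `datasets` library expose on streaming `IterableDataset` objects, and falls back to `skip()` otherwise. A condensed standalone sketch of the same idea, with "user/dataset" as a placeholder path and the state keys taken from the hunk above:

```python
# Sketch only: fast-forward a streaming HF dataset to an absolute index.
from datasets import load_dataset


def dataset_at_position(dataset_path: str, split: str, start_idx: int, token=None):
    ds = load_dataset(dataset_path, split=split, streaming=True, token=token)

    # Prefer the checkpointing API when the installed `datasets` version has it.
    if hasattr(ds, "state_dict") and hasattr(ds, "load_state_dict"):
        try:
            state = ds.state_dict()
            ex = state.get("examples_iterable")
            if isinstance(ex, dict) and "shard_example_idx" in ex:
                ex["shard_example_idx"] = start_idx  # same key the diff rewrites
                ds.load_state_dict(state)
                return ds
        except Exception:
            pass  # fall through to skip()

    # Portable fallback: lazily skip the first start_idx examples.
    return ds.skip(start_idx) if start_idx > 0 else ds


# Usage with a placeholder dataset path:
# ds = dataset_at_position("user/dataset", split="train", start_idx=10_000)
# first_item = next(iter(ds))
```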

caption_flow/utils/shard_processor.py
CHANGED
@@ -7,6 +7,8 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Generator, Tuple, Optional, Dict, Any
 from dataclasses import dataclass
+from datasets import load_dataset
+from .image_processor import ImageProcessor
 from threading import Event
 import shlex
 
@@ -108,10 +110,7 @@ class HFDatasetShardProcessor(ShardProcessor):
         connected: Event,
     ) -> Generator[Tuple[str, str, bytes, Dict[str, Any]], None, None]:
         """
-        Process HuggingFace virtual shard chunk with metadata.
-
-        Yields:
-            Tuple of (key, url, image_data, metadata)
+        Process HuggingFace virtual shard chunk with metadata, range by range.
         """
         if not dataset_loader:
             logger.error("No dataset loader configured for HuggingFace dataset shard")
@@ -121,49 +120,114 @@ class HFDatasetShardProcessor(ShardProcessor):
         unprocessed_ranges = getattr(chunk, "unprocessed_ranges", [(0, chunk.chunk_size - 1)])
 
         logger.info(
-            f"Processing HF dataset chunk {chunk.chunk_id} with
+            f"Processing HF dataset chunk {chunk.chunk_id} with {len(unprocessed_ranges)} ranges"
         )
 
-
-        current_idx = 0
-
-        # Construct proper virtual shard URL
-        parts = chunk.shard_url.split("_chunk_")
-        if len(parts) == 2:
-            base_path = parts[0]
-            virtual_shard_url = f"{base_path}:chunk:{chunk.start_index}"
-        else:
-            virtual_shard_url = chunk.shard_url
-
-        logger.debug(f"Using virtual shard URL: {virtual_shard_url}")
+        items_yielded = 0
 
-        #
-        for
-            virtual_shard_url
-        ):
-            # Check if we should stop
+        # Process each range independently with its own iterator
+        for range_start, range_end in unprocessed_ranges:
             if should_stop.is_set() or not connected.is_set():
                 logger.info(f"Stopping chunk processing early due to disconnect")
                 break
 
-            #
-
-
-
-            current_idx += 1
-            continue  # Skip already processed items
+            # Calculate absolute indices for this range
+            abs_start = chunk.start_index + range_start
+            abs_end = chunk.start_index + range_end
+            range_size = range_end - range_start + 1
 
-
-
-
+            logger.debug(
+                f"Processing range [{range_start}, {range_end}] "
+                f"(absolute: [{abs_start}, {abs_end}])"
+            )
 
-
-
-
+            try:
+                # Create a fresh dataset iterator for this range
+                dataset = load_dataset(
+                    dataset_loader.dataset_path,
+                    split=dataset_loader.split,
+                    streaming=True,
+                    token=dataset_loader.token,
+                )
+
+                # Use state_dict if available for efficient positioning
+                if hasattr(dataset, "load_state_dict") and hasattr(dataset, "state_dict"):
+                    try:
+                        state = dataset.state_dict()
+                        # Modify state to jump to abs_start
+                        if "num_examples_since_previous_state" in state:
+                            state["num_examples_since_previous_state"] = abs_start
+                        if "examples_iterable" in state and isinstance(
+                            state["examples_iterable"], dict
+                        ):
+                            if "shard_example_idx" in state["examples_iterable"]:
+                                state["examples_iterable"]["shard_example_idx"] = abs_start
+                        dataset.load_state_dict(state)
+                        logger.debug(f"Positioned dataset at index {abs_start} using state_dict")
+                    except Exception as e:
+                        logger.debug(f"Could not use state_dict, falling back to skip: {e}")
+                        dataset = dataset.skip(abs_start)
+                else:
+                    # Fall back to skip
+                    dataset = dataset.skip(abs_start)
+
+                # Process items in this range
+                range_items = 0
+                for item in dataset:
+                    if range_items >= range_size:
+                        break
+
+                    if should_stop.is_set() or not connected.is_set():
+                        break
+
+                    # Generate key for this item
+                    current_abs_idx = abs_start + range_items
+                    key = f"{dataset_loader.dataset_path.replace('/', '_')}_{current_abs_idx:08d}"
+
+                    try:
+                        if dataset_loader.image_column in item:
+                            img_data = item[dataset_loader.image_column]
+                            image_bytes = ImageProcessor.process_image_data(img_data)
+
+                            if image_bytes:
+                                # Extract metadata
+                                metadata = {
+                                    k: v
+                                    for k, v in item.items()
+                                    if k != dataset_loader.image_column
+                                }
+                                # Add chunk-relative index to metadata
+                                metadata["_chunk_relative_index"] = range_start + range_items
+
+                                url = f"hf://{dataset_loader.dataset_path}#{current_abs_idx}"
+
+                                items_yielded += 1
+                                range_items += 1
+
+                                yield key, url, image_bytes, metadata
+                            else:
+                                logger.warning(
+                                    f"Failed to process image at index {current_abs_idx}"
+                                )
+                                range_items += 1
+                        else:
+                            logger.warning(
+                                f"No image column '{dataset_loader.image_column}' at index {current_abs_idx}"
+                            )
+                            range_items += 1
+
+                    except Exception as e:
+                        logger.error(f"Error processing item at index {current_abs_idx}: {e}")
+                        range_items += 1
+                        continue
+
+            except Exception as e:
+                logger.error(f"Error processing range [{range_start}, {range_end}]: {e}")
+                continue
 
         logger.info(
-            f"HF dataset chunk {chunk.chunk_id}: yielded {
-            f"from
+            f"HF dataset chunk {chunk.chunk_id}: yielded {items_yielded} items "
+            f"from {len(unprocessed_ranges)} ranges"
        )
 
 
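The processor above walks each unprocessed range with its own freshly positioned iterator, so the index arithmetic is the part worth keeping straight: ranges are chunk-relative and inclusive, while keys and skip positions use absolute dataset indices. A small illustrative sketch of that mapping (the values and dataset path are made up):

```python
# Illustrative only: expand chunk-relative unprocessed ranges into the
# (relative index, absolute index, item key) triples the processor works with.
from typing import Iterator, List, Tuple


def iter_range_indices(
    chunk_start: int, unprocessed_ranges: List[Tuple[int, int]], dataset_path: str
) -> Iterator[Tuple[int, int, str]]:
    for range_start, range_end in unprocessed_ranges:  # inclusive bounds
        abs_start = chunk_start + range_start
        for offset in range(range_end - range_start + 1):
            rel_idx = range_start + offset
            abs_idx = abs_start + offset
            key = f"{dataset_path.replace('/', '_')}_{abs_idx:08d}"
            yield rel_idx, abs_idx, key


# A chunk starting at absolute index 5000 with two gaps left to process:
for rel, abs_idx, key in iter_range_indices(5000, [(0, 2), (10, 11)], "user/dataset"):
    print(rel, abs_idx, key)
# 0 5000 user_dataset_00005000
# 1 5001 user_dataset_00005001
# 2 5002 user_dataset_00005002
# 10 5010 user_dataset_00005010
# 11 5011 user_dataset_00005011
```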

{caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.2.0
+Version: 0.2.2
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT
@@ -32,6 +32,7 @@ Requires-Dist: pandas<3.0.0,>=2.3.1
 Requires-Dist: arrow<2.0.0,>=1.3.0
 Requires-Dist: datasets<5.0.0,>=4.0.0
 Requires-Dist: boto3<2.0.0,>=1.40.11
+Requires-Dist: torchdata<0.12.0,>=0.11.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.4.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"

{caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/RECORD
CHANGED
@@ -1,29 +1,29 @@
 caption_flow/__init__.py,sha256=NLPJ25lRN7xHqncXweINDNwbt0q8lgjZ30G21zlPdRs,303
-caption_flow/cli.py,sha256=
+caption_flow/cli.py,sha256=fkyQHzs5kei6-9ftkbJjko-K67TARxd7yNf7x9e7KSs,28820
 caption_flow/models.py,sha256=qo6lQiO10UISbaBVr6Cs-fSW_pmjwE6kmiTmmU_l3Wk,2140
-caption_flow/monitor.py,sha256=
-caption_flow/orchestrator.py,sha256=
-caption_flow/storage.py,sha256=
+caption_flow/monitor.py,sha256=ZZCSasYLKJ-UzA3-RoAtytv-tbNA-m3h5YjlZg_vukg,7870
+caption_flow/orchestrator.py,sha256=9yWKVcaR-S6naNQSd7Np8AemwV5lNDmB_lCufpvVrS0,96282
+caption_flow/storage.py,sha256=kGv9iQAgxwLLlAIPU6TBrlagdfxA339eBz1xG0yYRsc,40981
 caption_flow/utils/__init__.py,sha256=F1BChVoCsj9zn1GJRBOLHET1kLW6xrAmsbzcR7hHy6Y,202
 caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
 caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
 caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
 caption_flow/utils/checkpoint_tracker.py,sha256=8tsTFF-HcygitK92YcS-QWzeg-qRm9AuCpQoQRfC8M0,3335
-caption_flow/utils/chunk_tracker.py,sha256=
-caption_flow/utils/dataset_loader.py,sha256=
+caption_flow/utils/chunk_tracker.py,sha256=SO6ERvEwGXuikGDVaXFota_3Ix8BnePMU7CiZJKBAnQ,18025
+caption_flow/utils/dataset_loader.py,sha256=Bvo-aa5jWtjzqXW0rEisdiWaN7Q-aH02rXXUu9uXqGo,19194
 caption_flow/utils/image_processor.py,sha256=Zl8TAv9gYPdAYat3UiTuuNdIb2fXNfZ35AxsxuovJTs,5650
 caption_flow/utils/job_queue.py,sha256=itdfXcrkvGjmXn4qtpgMF63k1ufRBaejDe4V6WcxzgU,1104
 caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
 caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
-caption_flow/utils/shard_processor.py,sha256=
+caption_flow/utils/shard_processor.py,sha256=c6COBKhFzZyUeJqot5uGVR3ANeOReBfs8-DR27mrdcA,14242
 caption_flow/utils/shard_tracker.py,sha256=Wt2oE-O85F2FxSnqIocJiaYeFn00OVVjIiklZIZRGL8,3233
 caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
 caption_flow/workers/base.py,sha256=jPm_Xw4Lxd0cnrPs-biBqKRQKkTOJLvHLolmp0Gb1CI,7530
 caption_flow/workers/caption.py,sha256=NZ9kTjk2uOoNwyyNSkB_arYk213vLr5mowHN-OjiFkk,54631
 caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
-caption_flow-0.2.
-caption_flow-0.2.
-caption_flow-0.2.
-caption_flow-0.2.
-caption_flow-0.2.
-caption_flow-0.2.
+caption_flow-0.2.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+caption_flow-0.2.2.dist-info/METADATA,sha256=h9VN2ZWXVDH935Eavb-1kfsBpuW7m4Oph3tjh9ucc3w,11941
+caption_flow-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+caption_flow-0.2.2.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
+caption_flow-0.2.2.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
+caption_flow-0.2.2.dist-info/RECORD,,

File without changes
File without changes
File without changes
File without changes