caption-flow 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
caption_flow/storage.py CHANGED
@@ -386,10 +386,15 @@ class StorageManager:
 
         # Filter new data to exclude duplicates
         new_rows = []
+        duplicate_rows = []
         for row in prepared_buffer:
             if row["job_id"] not in existing_job_ids:
                 new_rows.append(row)
+            elif row not in duplicate_rows:
+                duplicate_rows.append(row)
 
+        if duplicate_rows:
+            logger.info(f"Example duplicate row: {duplicate_rows[0]}")
         if new_rows:
             # Create table from new rows only
             new_table = pa.Table.from_pylist(new_rows, schema=self.caption_schema)
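Note: the added lines above only report duplicates; rows are still filtered on job_id against what is already persisted. A minimal, self-contained sketch of that filtering pattern with pyarrow follows (the schema, the existing table, and the buffered rows are illustrative stand-ins for StorageManager's internal state, not the package's actual code):

# Illustrative sketch of the job_id-based deduplication used above; the schema,
# existing table, and buffered rows are stand-ins for StorageManager's state.
import pyarrow as pa

caption_schema = pa.schema([("job_id", pa.string()), ("caption", pa.string())])

existing_table = pa.Table.from_pylist(
    [{"job_id": "job-001", "caption": "a cat"}], schema=caption_schema
)
existing_job_ids = set(existing_table.column("job_id").to_pylist())

prepared_buffer = [
    {"job_id": "job-001", "caption": "a cat"},          # duplicate of a stored row
    {"job_id": "job-002", "caption": "a red bicycle"},  # genuinely new
]

new_rows = [r for r in prepared_buffer if r["job_id"] not in existing_job_ids]
duplicate_rows = [r for r in prepared_buffer if r["job_id"] in existing_job_ids]

if duplicate_rows:
    print(f"Example duplicate row: {duplicate_rows[0]}")

if new_rows:
    new_table = pa.Table.from_pylist(new_rows, schema=caption_schema)
    print(new_table.num_rows)  # 1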
caption_flow/utils/chunk_tracker.py CHANGED
@@ -441,9 +441,27 @@ class ChunkTracker(CheckpointTracker):
         )
 
     def get_chunk_with_unprocessed_items(self, chunk_id: str) -> Optional[Dict[str, Any]]:
-        """Get chunk info including unprocessed ranges."""
-        if chunk_id not in self.chunks:
+        """Get chunk info with unprocessed item ranges."""
+        chunk_state = self.chunks.get(chunk_id)
+        if not chunk_state:
             return None
 
-        chunk = self.chunks[chunk_id]
-        return {"chunk": chunk.to_dict(), "unprocessed_ranges": chunk.get_unprocessed_ranges()}
+        # During startup or if no worker is assigned, treat all unprocessed as available
+        if not hasattr(self, "_startup_complete"):
+            self._startup_complete = False
+
+        if not self._startup_complete or not chunk_state.assigned_to:
+            # Return all unprocessed ranges
+            return {
+                "chunk_id": chunk_id,
+                "unprocessed_ranges": chunk_state.get_unprocessed_ranges(),
+                "status": chunk_state.status,
+            }
+
+        # Normal operation - only return ranges not being worked on
+        # This would need more complex tracking of which ranges each worker is processing
+        return {
+            "chunk_id": chunk_id,
+            "unprocessed_ranges": chunk_state.get_unprocessed_ranges(),
+            "status": chunk_state.status,
+        }
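Note: the method above returns whatever chunk_state.get_unprocessed_ranges() reports. As a hedged illustration of the general idea only (not the package's actual ChunkState implementation), unprocessed item indices of a chunk can be collapsed into the inclusive (start, end) ranges consumed here:

# Hedged illustration (not the package's ChunkState): collapse unprocessed item
# indices of a chunk into the inclusive (start, end) ranges returned above.
from typing import List, Set, Tuple


def unprocessed_ranges(chunk_size: int, processed: Set[int]) -> List[Tuple[int, int]]:
    ranges: List[Tuple[int, int]] = []
    start = None
    for idx in range(chunk_size):
        if idx not in processed:
            if start is None:
                start = idx  # open a new gap
        elif start is not None:
            ranges.append((start, idx - 1))  # close the gap at a processed item
            start = None
    if start is not None:
        ranges.append((start, chunk_size - 1))
    return ranges


print(unprocessed_ranges(10, {0, 1, 2, 5, 6}))  # [(3, 4), (7, 9)]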
caption_flow/utils/dataset_loader.py CHANGED
@@ -217,200 +217,157 @@ class DatasetLoader:
         return dataset_path, start_idx, chunk_size
 
     def iterate_shard(
-        self, shard_url: str, processed_keys: Optional[set] = None
+        self,
+        shard_url: str,
+        processed_keys: Optional[set] = None,
+        unprocessed_ranges: Optional[List[Tuple[int, int]]] = None,
     ) -> Generator[Tuple[str, str, bytes], None, None]:
         """
         Iterate over items in a shard.
 
+        Args:
+            shard_url: URL or identifier of the shard
+            processed_keys: Set of already processed keys to skip
+            unprocessed_ranges: Specific ranges to process (for HF datasets)
+
         Yields:
             Tuple of (key, url, image_bytes)
         """
-        # Check if this is a virtual HuggingFace dataset shard
         if shard_url.startswith("hf_dataset:"):
-            yield from self._iterate_hf_dataset_shard(shard_url, processed_keys)
+            raise ValueError(
+                "Virtual HuggingFace dataset shards should use iterate_shard_with_metadata()"
+            )
         else:
             # Regular WebDataset shard
             ds = self.load_shard(shard_url, processed_keys)
             for key, url, image_data in ds:
                 yield key, url, image_data
 
-    def _iterate_hf_dataset_shard_with_metadata(
-        self, shard_url: str, processed_keys: Optional[set] = None
-    ) -> Generator[Tuple[str, str, bytes, Dict[str, Any]], None, None]:
-        """Iterate over a virtual HuggingFace dataset shard with metadata."""
-        if processed_keys is None:
-            processed_keys = set()
-
-        dataset_path, start_idx, chunk_size = self._parse_virtual_shard(shard_url)
-
-        logger.info(
-            f"Loading HuggingFace dataset with metadata: {dataset_path} (split: {self.split})"
-        )
-
+    def _create_dataset_at_position(self, dataset_path: str, split: str, start_idx: int):
+        """Create a dataset iterator positioned at start_idx using state_dict if available."""
         try:
             # Load dataset in streaming mode
             dataset = load_dataset(
                 dataset_path,
-                split=self.split,
+                split=split,
                 streaming=True,
                 token=self.token,
             )
 
-            # Skip to start index if needed - CONSISTENT WITH OTHER METHOD
-            if start_idx > 0:
-                dataset = dataset.skip(start_idx)
-
-            items_processed = 0
-
-            for item in dataset:
-                # Stop after processing chunk_size items
-                if items_processed >= chunk_size:
-                    break
-
-                # Generate a unique key for this item - CONSISTENT FORMAT
-                key = f"{dataset_path.replace('/', '_')}_{start_idx + items_processed:08d}"
-
-                if key in processed_keys:
-                    items_processed += 1
-                    continue
-
+            # Check if the dataset supports state_dict (newer versions of datasets library)
+            if hasattr(dataset, "load_state_dict") and hasattr(dataset, "state_dict"):
+                # Try to use the dataset's native state management
                 try:
-                    # Extract image data
-                    if self.image_column in item:
-                        img_data = item[self.image_column]
-
-                        # Process image to bytes
-                        image_bytes = ImageProcessor.process_image_data(img_data)
-
-                        if image_bytes:
-                            # Extract all metadata (excluding the image column)
-                            metadata = {k: v for k, v in item.items() if k != self.image_column}
+                    # Get current state
+                    state = dataset.state_dict()
+
+                    # Modify the state to skip to start_idx
+                    if "epoch" in state:
+                        state["epoch"] = 0
+                    if "num_examples_since_previous_state" in state:
+                        state["num_examples_since_previous_state"] = start_idx
+
+                    # For newer datasets with examples_iterable state
+                    if "examples_iterable" in state:
+                        if isinstance(state["examples_iterable"], dict):
+                            if "shard_example_idx" in state["examples_iterable"]:
+                                state["examples_iterable"]["shard_example_idx"] = start_idx
+
+                    # Load the modified state
+                    dataset.load_state_dict(state)
+                    logger.info(f"Positioned dataset at index {start_idx} using state_dict")
+                    return dataset
+                except Exception as e:
+                    logger.debug(f"Could not use state_dict approach: {e}")
 
-                            # URL is virtual for HF datasets
-                            url = f"hf://{dataset_path}#{start_idx + items_processed}"
-                            items_processed += 1
-                            yield key, url, image_bytes, metadata
-                        else:
-                            logger.warning(
-                                f"Failed to process image for item at index {start_idx + items_processed}"
-                            )
-                            items_processed += 1
-                            continue
-                    else:
-                        logger.warning(
-                            f"No image column '{self.image_column}' found in item at index {start_idx + items_processed}. "
-                            f"Available columns: {list(item.keys())}"
-                        )
-                        items_processed += 1
+            # Fall back to skip() for large skips
+            if start_idx > 0:
+                logger.info(f"Using skip() to position dataset at index {start_idx}")
+                dataset = dataset.skip(start_idx)
 
-                except Exception as e:
-                    logger.error(
-                        f"Error processing item at index {start_idx + items_processed}: {e}"
-                    )
-                    items_processed += 1
-                    continue
+            return dataset
 
         except Exception as e:
-            logger.error(f"Error loading HuggingFace dataset: {e}")
-            return
+            logger.warning(f"Error creating positioned dataset: {e}")
+            return None
 
-    def _iterate_hf_dataset_shard(
+    def _iterate_hf_dataset_shard_with_metadata(
         self, shard_url: str, processed_keys: Optional[set] = None
-    ) -> Generator[Tuple[str, str, bytes], None, None]:
-        """Iterate over a virtual HuggingFace dataset shard."""
+    ) -> Generator[Tuple[str, str, bytes, Dict[str, Any]], None, None]:
+        """Iterate over a virtual HuggingFace dataset shard with metadata."""
         if processed_keys is None:
             processed_keys = set()
 
         dataset_path, start_idx, chunk_size = self._parse_virtual_shard(shard_url)
 
-        # IMPORTANT: Check if start_idx is beyond dataset bounds
-        if self._hf_total_items is not None and start_idx >= self._hf_total_items:
-            logger.warning(
-                f"Virtual shard starts at index {start_idx} but dataset only has "
-                f"{self._hf_total_items} items. Skipping this shard."
-            )
-            return
-
         logger.info(
-            f"Loading HuggingFace dataset in streaming mode: {dataset_path} "
-            f"(split: {self.split}, start: {start_idx}, chunk_size: {chunk_size})"
+            f"Loading HuggingFace dataset with metadata: {dataset_path} (split: {self.split})"
         )
 
         try:
-            # Load dataset in streaming mode
-            dataset = load_dataset(
-                dataset_path,
-                split=self.split,
-                streaming=True,
-                token=self.token,
-            )
+            # For HF datasets, we iterate through the full chunk range
+            # The actual range filtering happens in the shard processor
+            items_processed = 0
+            current_abs_idx = start_idx
+
+            while items_processed < chunk_size:
+                # Create a fresh dataset iterator for each batch
+                # This avoids issues with stateful iterators
+                batch_size = min(1000, chunk_size - items_processed)  # Process in smaller batches
+
+                dataset = load_dataset(
+                    dataset_path,
+                    split=self.split,
+                    streaming=True,
+                    token=self.token,
+                )
 
-            # Use dataset.skip() for efficient skipping
-            if start_idx > 0:
-                dataset = dataset.skip(start_idx)
-                logger.info(f"Skipped to index {start_idx}")
+                # Skip to current position
+                if current_abs_idx > 0:
+                    dataset = dataset.skip(current_abs_idx)
 
-            items_processed = 0
+                batch_processed = 0
+                for item in dataset:
+                    if batch_processed >= batch_size or items_processed >= chunk_size:
+                        break
 
-            # Now enumerate starts from 0 after skip
-            for item in dataset:
-                # Stop after processing chunk_size items
-                if items_processed >= chunk_size:
-                    logger.info(f"Completed chunk: processed {items_processed} items")
-                    break
-
-                # Also stop if we've reached the dataset end
-                if self._hf_total_items and (start_idx + items_processed) >= self._hf_total_items:
-                    logger.info(
-                        f"Reached dataset end at item {start_idx + items_processed} "
-                        f"(total: {self._hf_total_items})"
-                    )
-                    break
+                    # Generate key
+                    key = f"{dataset_path.replace('/', '_')}_{current_abs_idx:08d}"
 
-                # Generate a unique key for this item - ensure proper formatting
-                key = f"{dataset_path.replace('/', '_')}_{start_idx + items_processed:08d}"
+                    if key in processed_keys:
+                        current_abs_idx += 1
+                        batch_processed += 1
+                        items_processed += 1
+                        continue
 
-                if key in processed_keys:
-                    items_processed += 1
-                    continue
+                    try:
+                        if self.image_column in item:
+                            img_data = item[self.image_column]
+                            image_bytes = ImageProcessor.process_image_data(img_data)
 
-                try:
-                    # Extract image data - check configured column name
-                    if self.image_column in item:
-                        img_data = item[self.image_column]
+                            if image_bytes:
+                                metadata = {k: v for k, v in item.items() if k != self.image_column}
+                                url = f"hf://{dataset_path}#{current_abs_idx}"
 
-                        # Delegate image processing to ImageProcessor
-                        image_bytes = ImageProcessor.process_image_data(img_data)
+                                yield key, url, image_bytes, metadata
 
-                        if image_bytes:
-                            # URL is virtual for HF datasets
-                            url = f"hf://{dataset_path}#{start_idx + items_processed}"
+                            current_abs_idx += 1
+                            batch_processed += 1
                             items_processed += 1
-                            yield key, url, image_bytes
                         else:
                             logger.warning(
-                                f"Failed to process image for item at index {start_idx + items_processed}"
+                                f"No image column '{self.image_column}' at index {current_abs_idx}"
                             )
+                            current_abs_idx += 1
+                            batch_processed += 1
                             items_processed += 1
-                            continue
-                    else:
-                        logger.warning(
-                            f"No image column '{self.image_column}' found in item at index {start_idx + items_processed}. "
-                            f"Available columns: {list(item.keys())}"
-                        )
-                        items_processed += 1
 
-                except Exception as e:
-                    logger.error(
-                        f"Error processing item at index {start_idx + items_processed}: {e}"
-                    )
-                    items_processed += 1
-                    continue
-
-            logger.info(
-                f"Virtual shard complete: processed {items_processed} items "
-                f"(start_idx: {start_idx})"
-            )
+                    except Exception as e:
+                        logger.error(f"Error processing item at index {current_abs_idx}: {e}")
+                        current_abs_idx += 1
+                        batch_processed += 1
+                        items_processed += 1
+                        continue
 
         except Exception as e:
             logger.error(f"Error loading HuggingFace dataset: {e}")
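Note: the key change in this file is _create_dataset_at_position, which tries to resume a streaming dataset near start_idx via the checkpointing API of newer datasets releases before falling back to skip(). A standalone, hedged sketch of the same idea follows; the state-dict layout is internal and version-dependent (hence the guards), and the dataset name in the usage line is an arbitrary public example, not something caption-flow ships with:

# Hedged sketch: position a streaming HF dataset near start_idx, preferring the
# IterableDataset checkpointing API over skip() when it is available.
from datasets import load_dataset


def dataset_at_position(dataset_path: str, split: str, start_idx: int):
    ds = load_dataset(dataset_path, split=split, streaming=True)

    if hasattr(ds, "state_dict") and hasattr(ds, "load_state_dict"):
        try:
            state = ds.state_dict()
            ex_state = state.get("examples_iterable")
            # Only patch keys that actually exist in this datasets version.
            if isinstance(ex_state, dict) and "shard_example_idx" in ex_state:
                ex_state["shard_example_idx"] = start_idx
                ds.load_state_dict(state)
                return ds
        except Exception:
            pass  # fall through to skip()

    # Fallback: lazily skip items from the start of the stream.
    return ds.skip(start_idx) if start_idx > 0 else ds


# Usage (arbitrary public dataset, purely illustrative):
stream = dataset_at_position("cifar10", "train", start_idx=5000)
first_item = next(iter(stream))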
caption_flow/utils/shard_processor.py CHANGED
@@ -7,6 +7,8 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Generator, Tuple, Optional, Dict, Any
 from dataclasses import dataclass
+from datasets import load_dataset
+from .image_processor import ImageProcessor
 from threading import Event
 import shlex
 
@@ -108,10 +110,7 @@ class HFDatasetShardProcessor(ShardProcessor):
         connected: Event,
     ) -> Generator[Tuple[str, str, bytes, Dict[str, Any]], None, None]:
         """
-        Process HuggingFace virtual shard chunk with metadata.
-
-        Yields:
-            Tuple of (key, url, image_data, metadata)
+        Process HuggingFace virtual shard chunk with metadata, range by range.
         """
         if not dataset_loader:
             logger.error("No dataset loader configured for HuggingFace dataset shard")
@@ -121,49 +120,114 @@ class HFDatasetShardProcessor(ShardProcessor):
         unprocessed_ranges = getattr(chunk, "unprocessed_ranges", [(0, chunk.chunk_size - 1)])
 
         logger.info(
-            f"Processing HF dataset chunk {chunk.chunk_id} with ranges: {unprocessed_ranges}"
+            f"Processing HF dataset chunk {chunk.chunk_id} with {len(unprocessed_ranges)} ranges"
         )
 
-        items_processed = 0
-        current_idx = 0
-
-        # Construct proper virtual shard URL
-        parts = chunk.shard_url.split("_chunk_")
-        if len(parts) == 2:
-            base_path = parts[0]
-            virtual_shard_url = f"{base_path}:chunk:{chunk.start_index}"
-        else:
-            virtual_shard_url = chunk.shard_url
-
-        logger.debug(f"Using virtual shard URL: {virtual_shard_url}")
+        items_yielded = 0
 
-        # Use the new iterate method that includes metadata
-        for key, url, image_data, metadata in dataset_loader.iterate_shard_with_metadata(
-            virtual_shard_url
-        ):
-            # Check if we should stop
+        # Process each range independently with its own iterator
+        for range_start, range_end in unprocessed_ranges:
             if should_stop.is_set() or not connected.is_set():
                 logger.info(f"Stopping chunk processing early due to disconnect")
                 break
 
-            # Check if current index is in any unprocessed range
-            in_range = any(start <= current_idx <= end for start, end in unprocessed_ranges)
-
-            if not in_range:
-                current_idx += 1
-                continue  # Skip already processed items
+            # Calculate absolute indices for this range
+            abs_start = chunk.start_index + range_start
+            abs_end = chunk.start_index + range_end
+            range_size = range_end - range_start + 1
 
-            # Check if we've processed enough for this chunk
-            if current_idx >= chunk.chunk_size:
-                break
+            logger.debug(
+                f"Processing range [{range_start}, {range_end}] "
+                f"(absolute: [{abs_start}, {abs_end}])"
+            )
 
-            items_processed += 1
-            current_idx += 1
-            yield key, url, image_data, metadata
+            try:
+                # Create a fresh dataset iterator for this range
+                dataset = load_dataset(
+                    dataset_loader.dataset_path,
+                    split=dataset_loader.split,
+                    streaming=True,
+                    token=dataset_loader.token,
+                )
+
+                # Use state_dict if available for efficient positioning
+                if hasattr(dataset, "load_state_dict") and hasattr(dataset, "state_dict"):
+                    try:
+                        state = dataset.state_dict()
+                        # Modify state to jump to abs_start
+                        if "num_examples_since_previous_state" in state:
+                            state["num_examples_since_previous_state"] = abs_start
+                        if "examples_iterable" in state and isinstance(
+                            state["examples_iterable"], dict
+                        ):
+                            if "shard_example_idx" in state["examples_iterable"]:
+                                state["examples_iterable"]["shard_example_idx"] = abs_start
+                        dataset.load_state_dict(state)
+                        logger.debug(f"Positioned dataset at index {abs_start} using state_dict")
+                    except Exception as e:
+                        logger.debug(f"Could not use state_dict, falling back to skip: {e}")
+                        dataset = dataset.skip(abs_start)
+                else:
+                    # Fall back to skip
+                    dataset = dataset.skip(abs_start)
+
+                # Process items in this range
+                range_items = 0
+                for item in dataset:
+                    if range_items >= range_size:
+                        break
+
+                    if should_stop.is_set() or not connected.is_set():
+                        break
+
+                    # Generate key for this item
+                    current_abs_idx = abs_start + range_items
+                    key = f"{dataset_loader.dataset_path.replace('/', '_')}_{current_abs_idx:08d}"
+
+                    try:
+                        if dataset_loader.image_column in item:
+                            img_data = item[dataset_loader.image_column]
+                            image_bytes = ImageProcessor.process_image_data(img_data)
+
+                            if image_bytes:
+                                # Extract metadata
+                                metadata = {
+                                    k: v
+                                    for k, v in item.items()
+                                    if k != dataset_loader.image_column
+                                }
+                                # Add chunk-relative index to metadata
+                                metadata["_chunk_relative_index"] = range_start + range_items
+
+                                url = f"hf://{dataset_loader.dataset_path}#{current_abs_idx}"
+
+                                items_yielded += 1
+                                range_items += 1
+
+                                yield key, url, image_bytes, metadata
+                            else:
+                                logger.warning(
+                                    f"Failed to process image at index {current_abs_idx}"
+                                )
+                                range_items += 1
+                        else:
+                            logger.warning(
+                                f"No image column '{dataset_loader.image_column}' at index {current_abs_idx}"
+                            )
+                            range_items += 1
+
+                    except Exception as e:
+                        logger.error(f"Error processing item at index {current_abs_idx}: {e}")
+                        range_items += 1
+                        continue
+
+            except Exception as e:
+                logger.error(f"Error processing range [{range_start}, {range_end}]: {e}")
+                continue
 
         logger.info(
-            f"HF dataset chunk {chunk.chunk_id}: yielded {items_processed} items "
-            f"from ranges {unprocessed_ranges}"
+            f"HF dataset chunk {chunk.chunk_id}: yielded {items_yielded} items "
+            f"from {len(unprocessed_ranges)} ranges"
        )
 
 
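Note: the rewritten processor above walks each unprocessed (start, end) range with its own freshly positioned iterator instead of scanning the whole chunk and checking every item against the ranges. A minimal sketch of that per-range pattern, with a plain generator standing in for the positioned load_dataset(...) stream (all names here are illustrative):

# Minimal sketch of per-range processing: open a freshly positioned iterator per
# unprocessed (start, end) range instead of scanning the whole chunk. Here
# make_stream stands in for "load_dataset(...).skip(abs_start)" in the diff.
from itertools import islice
from typing import Callable, Dict, Iterator, List, Tuple


def process_ranges(
    make_stream: Callable[[int], Iterator[Dict[str, int]]],
    chunk_start: int,
    unprocessed_ranges: List[Tuple[int, int]],
) -> Iterator[Tuple[int, Dict[str, int]]]:
    for range_start, range_end in unprocessed_ranges:
        abs_start = chunk_start + range_start
        range_size = range_end - range_start + 1
        # Pull exactly range_size items starting at the absolute offset.
        for offset, item in enumerate(islice(make_stream(abs_start), range_size)):
            yield abs_start + offset, item


def make_stream(start: int) -> Iterator[Dict[str, int]]:
    # Toy stand-in for a positioned streaming dataset.
    return ({"idx": i} for i in range(start, start + 10_000))


# Ranges (2, 3) and (6, 6) of a chunk starting at absolute index 100 remain unprocessed.
for abs_idx, item in process_ranges(make_stream, 100, [(2, 3), (6, 6)]):
    print(abs_idx, item)  # prints indices 102, 103, 106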
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.2.0
+Version: 0.2.2
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT
@@ -32,6 +32,7 @@ Requires-Dist: pandas<3.0.0,>=2.3.1
 Requires-Dist: arrow<2.0.0,>=1.3.0
 Requires-Dist: datasets<5.0.0,>=4.0.0
 Requires-Dist: boto3<2.0.0,>=1.40.11
+Requires-Dist: torchdata<0.12.0,>=0.11.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.4.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -1,29 +1,29 @@
 caption_flow/__init__.py,sha256=NLPJ25lRN7xHqncXweINDNwbt0q8lgjZ30G21zlPdRs,303
-caption_flow/cli.py,sha256=DVVN4e4uL0jL0gRTaIC5BL0DBU2IU_2yUOi4lg6-lEw,28639
+caption_flow/cli.py,sha256=fkyQHzs5kei6-9ftkbJjko-K67TARxd7yNf7x9e7KSs,28820
 caption_flow/models.py,sha256=qo6lQiO10UISbaBVr6Cs-fSW_pmjwE6kmiTmmU_l3Wk,2140
-caption_flow/monitor.py,sha256=MltOwBqcFwni1XEPWu5dIO-os5NKDbH_LInOBXUWHAY,7870
-caption_flow/orchestrator.py,sha256=vLW_w5KuRn9Asy_343DxZDRxiUs0xYgbfuuNGgqIf7k,76403
-caption_flow/storage.py,sha256=hC6ZHT_PHFoUVjqD5JUwy3_79oAD1e1H30neA_xsz7s,40748
+caption_flow/monitor.py,sha256=ZZCSasYLKJ-UzA3-RoAtytv-tbNA-m3h5YjlZg_vukg,7870
+caption_flow/orchestrator.py,sha256=9yWKVcaR-S6naNQSd7Np8AemwV5lNDmB_lCufpvVrS0,96282
+caption_flow/storage.py,sha256=kGv9iQAgxwLLlAIPU6TBrlagdfxA339eBz1xG0yYRsc,40981
 caption_flow/utils/__init__.py,sha256=F1BChVoCsj9zn1GJRBOLHET1kLW6xrAmsbzcR7hHy6Y,202
 caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
 caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
 caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
 caption_flow/utils/checkpoint_tracker.py,sha256=8tsTFF-HcygitK92YcS-QWzeg-qRm9AuCpQoQRfC8M0,3335
-caption_flow/utils/chunk_tracker.py,sha256=hKn8CN6ubErc9kuCWZMj12ZCZKxVlqXqAEocbzjfa-k,17296
-caption_flow/utils/dataset_loader.py,sha256=qjoRuPnCv_2nGPfrdqf45AgBXlthw1HwqZ1IqwIXzH4,20792
+caption_flow/utils/chunk_tracker.py,sha256=SO6ERvEwGXuikGDVaXFota_3Ix8BnePMU7CiZJKBAnQ,18025
+caption_flow/utils/dataset_loader.py,sha256=Bvo-aa5jWtjzqXW0rEisdiWaN7Q-aH02rXXUu9uXqGo,19194
 caption_flow/utils/image_processor.py,sha256=Zl8TAv9gYPdAYat3UiTuuNdIb2fXNfZ35AxsxuovJTs,5650
 caption_flow/utils/job_queue.py,sha256=itdfXcrkvGjmXn4qtpgMF63k1ufRBaejDe4V6WcxzgU,1104
 caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
 caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
-caption_flow/utils/shard_processor.py,sha256=CRda6M4xh4U0vwvYlzq9nJEzz4d_4yzUBosYAeBcPEA,10854
+caption_flow/utils/shard_processor.py,sha256=c6COBKhFzZyUeJqot5uGVR3ANeOReBfs8-DR27mrdcA,14242
 caption_flow/utils/shard_tracker.py,sha256=Wt2oE-O85F2FxSnqIocJiaYeFn00OVVjIiklZIZRGL8,3233
 caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
 caption_flow/workers/base.py,sha256=jPm_Xw4Lxd0cnrPs-biBqKRQKkTOJLvHLolmp0Gb1CI,7530
 caption_flow/workers/caption.py,sha256=NZ9kTjk2uOoNwyyNSkB_arYk213vLr5mowHN-OjiFkk,54631
 caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
-caption_flow-0.2.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-caption_flow-0.2.0.dist-info/METADATA,sha256=6qwt05U0S23Omjz1yR6VzLq_wRHbRx_xl3YzhwHyDLc,11900
-caption_flow-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-caption_flow-0.2.0.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
-caption_flow-0.2.0.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
-caption_flow-0.2.0.dist-info/RECORD,,
+caption_flow-0.2.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+caption_flow-0.2.2.dist-info/METADATA,sha256=h9VN2ZWXVDH935Eavb-1kfsBpuW7m4Oph3tjh9ucc3w,11941
+caption_flow-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+caption_flow-0.2.2.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
+caption_flow-0.2.2.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
+caption_flow-0.2.2.dist-info/RECORD,,