vlm-dataset-captioner 0.0.2__tar.gz → 0.0.4__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vlm-dataset-captioner
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Uses a VLM to caption images from a dataset.
5
5
  Author: Alex Senden
6
6
  Maintainer: Alex Senden
@@ -49,7 +49,7 @@ def get_prompt_for_directory(directory_path):
49
49
  f"WARN: Prompt file not found for directory {prompt_file_path}. Using default prompt.",
50
50
  flush=True,
51
51
  )
52
- prompt = "Describe the image in detail."
52
+ prompt = "In one short sentence. The caption will be used for image indexing and search, so include relevant details. 1 sentence only."
53
53
 
54
54
  print(f"INFO: Using prompt: '{prompt}'", flush=True)
55
55
 
@@ -86,6 +86,8 @@ def contains_chinese(text_string):
86
86
  def caption_image(prompt, image, model, processor, max_new_tokens=None):
87
87
  messages = get_messages(prompt, image)
88
88
 
89
+ print(f"INFO: Generating caption for image: {image}.", flush=True)
90
+
89
91
  # Prepare inputs for the model
90
92
  text = processor.apply_chat_template(
91
93
  messages, tokenize=False, add_generation_prompt=True
@@ -129,6 +131,8 @@ def caption_image(prompt, image, model, processor, max_new_tokens=None):
129
131
  clean_up_tokenization_spaces=False,
130
132
  )
131
133
 
134
+ print(f"INFO: Caption generated for image: {image}.", flush=True)
135
+
132
136
  return output_text[0]
133
137
 
134
138
 
@@ -206,15 +210,20 @@ def caption_entire_directory(
206
210
  caption += "\n"
207
211
 
208
212
  while True:
209
- caption += caption_image(
213
+ individual_caption = caption_image(
210
214
  prompt,
211
215
  os.path.join(directory_path, image_file),
212
216
  model,
213
217
  processor,
214
218
  max_new_tokens,
215
219
  )
216
- if not contains_chinese(caption):
220
+ if not contains_chinese(individual_caption):
221
+ caption += individual_caption
217
222
  break
223
+ print(
224
+ "WARN: Detected Chinese characters in caption. Regenerating...",
225
+ flush=True,
226
+ )
218
227
  write_caption_to_file(image_file, caption, output_directory)
219
228
  except Exception as e:
220
229
  print(
@@ -16,7 +16,7 @@ def parse_args():
16
16
  parser.add_argument(
17
17
  "--model",
18
18
  type=str,
19
- default=None,
19
+ default="Qwen/Qwen2.5-VL-32B-Instruct",
20
20
  help="The HuggingFace model used to generate captions.",
21
21
  )
22
22
  parser.add_argument(
@@ -55,8 +55,10 @@ def parse_args():
55
55
  def main():
56
56
  args = parse_args()
57
57
  model, processor = init_model(args.model)
58
-
59
- output_dir = args.output_dir if args.output_dir is not None else f"{args.input_dir}_caption"
58
+
59
+ output_dir = (
60
+ args.output_dir if args.output_dir is not None else f"{args.input_dir}_caption"
61
+ )
60
62
 
61
63
  if args.model is not None:
62
64
  print(f"INFO: Using model {args.model} for captioning.", flush=True)
@@ -71,8 +73,8 @@ def main():
71
73
  caption_entire_directory(
72
74
  args.input_dir,
73
75
  output_dir,
74
- model,
75
- processor,
76
+ model=model,
77
+ processor=processor,
76
78
  max_new_tokens=args.max_length,
77
79
  ignore_substring=args.ignore_substring,
78
80
  num_captions=args.num_captions,