datasety 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.12"
18
+
19
+ - name: Install build dependencies
20
+ run: pip install build
21
+
22
+ - name: Build package
23
+ run: python -m build
24
+
25
+ - name: Upload artifacts
26
+ uses: actions/upload-artifact@v4
27
+ with:
28
+ name: dist
29
+ path: dist/
30
+
31
+ publish:
32
+ needs: build
33
+ runs-on: ubuntu-latest
34
+ environment: pypi
35
+ permissions:
36
+ id-token: write
37
+ steps:
38
+ - name: Download artifacts
39
+ uses: actions/download-artifact@v4
40
+ with:
41
+ name: dist
42
+ path: dist/
43
+
44
+ - name: Publish to PyPI
45
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,46 @@
1
+ name: Test
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ pip install -e .
27
+ pip install pytest
28
+
29
+ - name: Run tests
30
+ run: pytest -v
31
+
32
+ lint:
33
+ runs-on: ubuntu-latest
34
+ steps:
35
+ - uses: actions/checkout@v4
36
+
37
+ - name: Set up Python
38
+ uses: actions/setup-python@v5
39
+ with:
40
+ python-version: "3.12"
41
+
42
+ - name: Install ruff
43
+ run: pip install ruff
44
+
45
+ - name: Run linter
46
+ run: ruff check src/
@@ -0,0 +1,36 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ *.egg
11
+
12
+ # Virtual environments
13
+ venv/
14
+ .venv/
15
+ env/
16
+
17
+ # IDE
18
+ .idea/
19
+ .vscode/
20
+ *.swp
21
+ *.swo
22
+
23
+ # Testing
24
+ .pytest_cache/
25
+ .coverage
26
+ htmlcov/
27
+
28
+ # OS
29
+ .DS_Store
30
+ Thumbs.db
31
+
32
+ # Project specific
33
+ *.jpg
34
+ *.jpeg
35
+ *.png
36
+ *.webp
datasety-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,164 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasety
3
+ Version: 0.1.0
4
+ Summary: CLI tool for dataset preparation: image resizing and captioning with Florence-2
5
+ Project-URL: Homepage, https://github.com/kontextox/datasety
6
+ Project-URL: Repository, https://github.com/kontextox/datasety
7
+ Project-URL: Issues, https://github.com/kontextox/datasety/issues
8
+ Author: kontextox
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: captioning,cli,dataset,florence-2,image-processing,machine-learning
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Scientific/Engineering :: Image Processing
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: pillow>=9.0.0
26
+ Provides-Extra: caption
27
+ Requires-Dist: einops; extra == 'caption'
28
+ Requires-Dist: timm; extra == 'caption'
29
+ Requires-Dist: torch>=2.0.0; extra == 'caption'
30
+ Requires-Dist: transformers<4.46.0,>=4.38.0; extra == 'caption'
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
33
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
34
+ Description-Content-Type: text/markdown
35
+
36
+ # datasety
37
+
38
+ CLI tool for dataset preparation: image resizing and captioning with Florence-2.
39
+
40
+ ## Installation
41
+
42
+ ```bash
43
+ pip install datasety
44
+ ```
45
+
46
+ For captioning support (requires PyTorch and Transformers):
47
+
48
+ ```bash
49
+ pip install "datasety[caption]"
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ ### Resize Images
55
+
56
+ Resize and crop images to a target resolution:
57
+
58
+ ```bash
59
+ datasety resize --input ./images --output ./resized --resolution 768x1024
60
+ ```
61
+
62
+ **Options:**
63
+
64
+ | Option | Description | Default |
65
+ | ----------------------- | --------------------------------------------------------- | ------------------- |
66
+ | `--input`, `-i` | Input directory | (required) |
67
+ | `--output`, `-o` | Output directory | (required) |
68
+ | `--resolution`, `-r` | Target resolution (WIDTHxHEIGHT) | (required) |
69
+ | `--crop-position` | Crop position: `top`, `center`, `bottom`, `left`, `right` | `center` |
70
+ | `--input-format` | Comma-separated formats | `jpg,jpeg,png,webp` |
71
+ | `--output-format` | Output format: `jpg`, `png`, `webp` | `jpg` |
72
+ | `--output-name-numbers` | Rename files to 1.jpg, 2.jpg, ... | `false` |
73
+
74
+ **Example:**
75
+
76
+ ```bash
77
+ datasety resize \
78
+ --input ./raw_photos \
79
+ --output ./dataset \
80
+ --resolution 1024x1024 \
81
+ --crop-position top \
82
+ --output-format jpg \
83
+ --output-name-numbers
84
+ ```
85
+
86
+ **How it works:**
87
+
88
+ 1. Finds all images matching input formats
89
+ 2. Skips images where either dimension is smaller than target
90
+ 3. Resizes proportionally so the image fully covers the target (one side matches exactly, the other is equal or larger)
91
+ 4. Crops from the specified area to exact dimensions
92
+ 5. Saves with high quality (95% for jpg/webp)
93
+
94
+ ### Generate Captions
95
+
96
+ Generate captions for images using Microsoft's Florence-2 model:
97
+
98
+ ```bash
99
+ datasety caption --input ./images --output ./captions --florence-2-large
100
+ ```
101
+
102
+ **Options:**
103
+
104
+ | Option | Description | Default |
105
+ | -------------------- | ------------------------------- | ------------------------- |
106
+ | `--input`, `-i` | Input directory | (required) |
107
+ | `--output`, `-o` | Output directory for .txt files | (required) |
108
+ | `--device` | `cpu` or `cuda` | `cpu` |
109
+ | `--trigger-word` | Text to prepend to captions | (none) |
110
+ | `--prompt` | Florence-2 task prompt | `<MORE_DETAILED_CAPTION>` |
111
+ | `--florence-2-base` | Use base model (0.23B, faster) | |
112
+ | `--florence-2-large` | Use large model (0.77B, better) | (default) |
113
+
114
+ **Available prompts:**
115
+
116
+ - `<CAPTION>` - Brief caption
117
+ - `<DETAILED_CAPTION>` - Detailed caption
118
+ - `<MORE_DETAILED_CAPTION>` - Most detailed caption (default)
119
+
120
+ **Example:**
121
+
122
+ ```bash
123
+ datasety caption \
124
+ --input ./dataset \
125
+ --output ./dataset \
126
+ --device cuda \
127
+ --trigger-word "photo of sks person," \
128
+ --florence-2-large
129
+ ```
130
+
131
+ This creates a `.txt` file for each image with the generated caption.
132
+
133
+ ## Common Workflows
134
+
135
+ ### Prepare a LoRA Training Dataset
136
+
137
+ ```bash
138
+ # 1. Resize images to 1024x1024
139
+ datasety resize -i ./raw -o ./dataset -r 1024x1024 --crop-position center
140
+
141
+ # 2. Generate captions with trigger word
142
+ datasety caption -i ./dataset -o ./dataset --trigger-word "[trigger]" --device cuda
143
+ ```
144
+
145
+ ### Batch Process with Numbered Files
146
+
147
+ ```bash
148
+ datasety resize \
149
+ -i ./photos \
150
+ -o ./processed \
151
+ -r 768x1024 \
152
+ --output-name-numbers \
153
+ --crop-position top
154
+ ```
155
+
156
+ ## Requirements
157
+
158
+ - Python 3.10+
159
+ - Pillow (for resize)
160
+ - PyTorch + Transformers (for caption, install with `pip install "datasety[caption]"`)
161
+
162
+ ## License
163
+
164
+ MIT
@@ -0,0 +1,129 @@
1
+ # datasety
2
+
3
+ CLI tool for dataset preparation: image resizing and captioning with Florence-2.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install datasety
9
+ ```
10
+
11
+ For captioning support (requires PyTorch and Transformers):
12
+
13
+ ```bash
14
+ pip install "datasety[caption]"
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ ### Resize Images
20
+
21
+ Resize and crop images to a target resolution:
22
+
23
+ ```bash
24
+ datasety resize --input ./images --output ./resized --resolution 768x1024
25
+ ```
26
+
27
+ **Options:**
28
+
29
+ | Option | Description | Default |
30
+ | ----------------------- | --------------------------------------------------------- | ------------------- |
31
+ | `--input`, `-i` | Input directory | (required) |
32
+ | `--output`, `-o` | Output directory | (required) |
33
+ | `--resolution`, `-r` | Target resolution (WIDTHxHEIGHT) | (required) |
34
+ | `--crop-position` | Crop position: `top`, `center`, `bottom`, `left`, `right` | `center` |
35
+ | `--input-format` | Comma-separated formats | `jpg,jpeg,png,webp` |
36
+ | `--output-format` | Output format: `jpg`, `png`, `webp` | `jpg` |
37
+ | `--output-name-numbers` | Rename files to 1.jpg, 2.jpg, ... | `false` |
38
+
39
+ **Example:**
40
+
41
+ ```bash
42
+ datasety resize \
43
+ --input ./raw_photos \
44
+ --output ./dataset \
45
+ --resolution 1024x1024 \
46
+ --crop-position top \
47
+ --output-format jpg \
48
+ --output-name-numbers
49
+ ```
50
+
51
+ **How it works:**
52
+
53
+ 1. Finds all images matching input formats
54
+ 2. Skips images where either dimension is smaller than target
55
+ 3. Resizes proportionally so the image fully covers the target (one side matches exactly, the other is equal or larger)
56
+ 4. Crops from the specified area to exact dimensions
57
+ 5. Saves with high quality (95% for jpg/webp)
58
+
59
+ ### Generate Captions
60
+
61
+ Generate captions for images using Microsoft's Florence-2 model:
62
+
63
+ ```bash
64
+ datasety caption --input ./images --output ./captions --florence-2-large
65
+ ```
66
+
67
+ **Options:**
68
+
69
+ | Option | Description | Default |
70
+ | -------------------- | ------------------------------- | ------------------------- |
71
+ | `--input`, `-i` | Input directory | (required) |
72
+ | `--output`, `-o` | Output directory for .txt files | (required) |
73
+ | `--device` | `cpu` or `cuda` | `cpu` |
74
+ | `--trigger-word` | Text to prepend to captions | (none) |
75
+ | `--prompt` | Florence-2 task prompt | `<MORE_DETAILED_CAPTION>` |
76
+ | `--florence-2-base` | Use base model (0.23B, faster) | |
77
+ | `--florence-2-large` | Use large model (0.77B, better) | (default) |
78
+
79
+ **Available prompts:**
80
+
81
+ - `<CAPTION>` - Brief caption
82
+ - `<DETAILED_CAPTION>` - Detailed caption
83
+ - `<MORE_DETAILED_CAPTION>` - Most detailed caption (default)
84
+
85
+ **Example:**
86
+
87
+ ```bash
88
+ datasety caption \
89
+ --input ./dataset \
90
+ --output ./dataset \
91
+ --device cuda \
92
+ --trigger-word "photo of sks person," \
93
+ --florence-2-large
94
+ ```
95
+
96
+ This creates a `.txt` file for each image with the generated caption.
97
+
98
+ ## Common Workflows
99
+
100
+ ### Prepare a LoRA Training Dataset
101
+
102
+ ```bash
103
+ # 1. Resize images to 1024x1024
104
+ datasety resize -i ./raw -o ./dataset -r 1024x1024 --crop-position center
105
+
106
+ # 2. Generate captions with trigger word
107
+ datasety caption -i ./dataset -o ./dataset --trigger-word "[trigger]" --device cuda
108
+ ```
109
+
110
+ ### Batch Process with Numbered Files
111
+
112
+ ```bash
113
+ datasety resize \
114
+ -i ./photos \
115
+ -o ./processed \
116
+ -r 768x1024 \
117
+ --output-name-numbers \
118
+ --crop-position top
119
+ ```
120
+
121
+ ## Requirements
122
+
123
+ - Python 3.10+
124
+ - Pillow (for resize)
125
+ - PyTorch + Transformers (for caption, install with `pip install "datasety[caption]"`)
126
+
127
+ ## License
128
+
129
+ MIT
@@ -0,0 +1,72 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "datasety"
7
+ dynamic = ["version"]
8
+ description = "CLI tool for dataset preparation: image resizing and captioning with Florence-2"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "kontextox" }
14
+ ]
15
+ keywords = [
16
+ "dataset",
17
+ "image-processing",
18
+ "captioning",
19
+ "florence-2",
20
+ "machine-learning",
21
+ "cli",
22
+ ]
23
+ classifiers = [
24
+ "Development Status :: 4 - Beta",
25
+ "Environment :: Console",
26
+ "Intended Audience :: Developers",
27
+ "Intended Audience :: Science/Research",
28
+ "License :: OSI Approved :: MIT License",
29
+ "Operating System :: OS Independent",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.10",
32
+ "Programming Language :: Python :: 3.11",
33
+ "Programming Language :: Python :: 3.12",
34
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
35
+ "Topic :: Scientific/Engineering :: Image Processing",
36
+ ]
37
+ dependencies = [
38
+ "Pillow>=9.0.0",
39
+ ]
40
+
41
+ [project.optional-dependencies]
42
+ caption = [
43
+ "torch>=2.0.0",
44
+ "transformers>=4.38.0,<4.46.0",
45
+ "einops",
46
+ "timm",
47
+ ]
48
+ dev = [
49
+ "pytest>=7.0.0",
50
+ "ruff>=0.1.0",
51
+ ]
52
+
53
+ [project.scripts]
54
+ datasety = "datasety.cli:main"
55
+
56
+ [project.urls]
57
+ Homepage = "https://github.com/kontextox/datasety"
58
+ Repository = "https://github.com/kontextox/datasety"
59
+ Issues = "https://github.com/kontextox/datasety/issues"
60
+
61
+ [tool.hatch.version]
62
+ path = "src/datasety/__init__.py"
63
+
64
+ [tool.hatch.build.targets.wheel]
65
+ packages = ["src/datasety"]
66
+
67
+ [tool.ruff]
68
+ line-length = 100
69
+ target-version = "py310"
70
+
71
+ [tool.ruff.lint]
72
+ select = ["E", "F", "I", "W"]
@@ -0,0 +1,3 @@
1
+ """datasety - CLI tool for dataset preparation: image resizing and captioning."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,6 @@
1
+ """Allow running as `python -m datasety`."""
2
+
3
+ from datasety.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
@@ -0,0 +1,388 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ datasety - CLI tool for dataset preparation: image resizing and captioning.
4
+
5
+ Usage:
6
+ datasety resize --input ./in --output ./out --resolution 768x1024 --crop-position top
7
+ datasety caption --input ./in --output ./out --trigger-word "[trigger]" --florence-2-large
8
+ """
9
+
10
+ import argparse
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ from PIL import Image
15
+
16
+
17
def get_image_files(input_dir: Path, formats: list[str]) -> list[Path]:
    """Find all files directly inside ``input_dir`` whose extension is in ``formats``.

    Matching is case-insensitive, so ``photo.jpg``, ``photo.JPG`` and
    ``photo.Jpg`` all match the format ``"jpg"``. The search is not recursive
    and only regular files are returned (a directory named ``x.jpg`` is ignored).

    Args:
        input_dir: Directory to scan.
        formats: Extensions to accept. Surrounding whitespace, letter case and
            a leading dot are ignored; empty entries are dropped.

    Returns:
        Sorted list of matching paths without duplicates. An empty list is
        returned when ``input_dir`` is not an existing directory.
    """
    # Function-scope import: fnmatch keeps the glob-style semantics of the
    # previous ``Path.glob`` implementation without adding a module-level import.
    from fnmatch import fnmatchcase

    extensions = {fmt.strip().lower().lstrip(".") for fmt in formats}
    extensions.discard("")
    patterns = [f"*.{ext}" for ext in extensions]

    if not patterns or not input_dir.is_dir():
        return []

    return sorted(
        entry
        for entry in input_dir.iterdir()
        # Compare against the lower-cased name so any mix of letter case matches.
        if entry.is_file()
        and any(fnmatchcase(entry.name.lower(), pattern) for pattern in patterns)
    )
25
+
26
+
27
def calculate_resize_and_crop(
    orig_width: int, orig_height: int,
    target_width: int, target_height: int,
    crop_position: str
) -> tuple[tuple[int, int], tuple[int, int, int, int]]:
    """
    Calculate resize dimensions and crop box.

    The image is scaled proportionally so that it fully covers the target
    (one side equals the target, the other is equal or larger), then a
    target-sized window is positioned inside the scaled image.

    Args:
        orig_width: Source image width in pixels (must be positive).
        orig_height: Source image height in pixels (must be positive).
        target_width: Desired output width in pixels (must be positive).
        target_height: Desired output height in pixels (must be positive).
        crop_position: Where to position the crop window (what to keep).
            'top' keeps top, 'right' keeps right, etc. One of
            'center', 'top', 'bottom', 'left', 'right'.

    Returns:
        (new_width, new_height), (left, top, right, bottom)

    Raises:
        ValueError: If any dimension is not positive or ``crop_position``
            is not one of the supported values.
    """
    if min(orig_width, orig_height, target_width, target_height) <= 0:
        raise ValueError(
            "Image and target dimensions must be positive: "
            f"{orig_width}x{orig_height} -> {target_width}x{target_height}"
        )

    # Compare aspect ratios by cross-multiplication and scale with integer
    # floor division. Float math such as int(49 * (1 / 49)) truncates to 0,
    # which would make the scaled image smaller than the crop window.
    if orig_width * target_height > target_width * orig_height:
        # Image is wider - resize by height, crop width
        new_height = target_height
        new_width = (orig_width * target_height) // orig_height
    else:
        # Image is taller - resize by width, crop height
        new_width = target_width
        new_height = (orig_height * target_width) // orig_width

    # Pixels left over on each axis after the scaled image covers the target.
    spare_w = new_width - target_width
    spare_h = new_height - target_height

    # Calculate crop box based on position (what to keep)
    if crop_position == "center":
        left, top = spare_w // 2, spare_h // 2
    elif crop_position == "top":
        left, top = spare_w // 2, 0
    elif crop_position == "bottom":
        left, top = spare_w // 2, spare_h
    elif crop_position == "left":
        left, top = 0, spare_h // 2
    elif crop_position == "right":
        left, top = spare_w, spare_h // 2
    else:
        raise ValueError(f"Invalid crop position: {crop_position}")

    right = left + target_width
    bottom = top + target_height

    return (new_width, new_height), (left, top, right, bottom)
77
+
78
+
79
def cmd_resize(args):
    """Execute the resize command.

    Scans ``args.input`` for images, scales and crops each one to the
    requested resolution and writes the result into ``args.output``.

    Args:
        args: Parsed CLI namespace. The attributes read are ``input``,
            ``output``, ``resolution`` (``WIDTHxHEIGHT``), ``input_format``
            (comma-separated extensions), ``crop_position``,
            ``output_format`` and ``output_name_numbers``.

    Exits with status 1 when the input path is missing or not a directory, or
    when the resolution is malformed or not positive. Exits with status 0 when
    no matching images are found. Per-image failures are reported and counted
    as skipped; they do not abort the run.
    """
    input_dir = Path(args.input)
    output_dir = Path(args.output)

    if not input_dir.exists():
        print(f"Error: Input directory '{input_dir}' does not exist.")
        sys.exit(1)
    if not input_dir.is_dir():
        print(f"Error: Input path '{input_dir}' is not a directory.")
        sys.exit(1)

    # Parse resolution before touching the filesystem so that a bad argument
    # does not leave an empty output directory behind.
    try:
        width, height = map(int, args.resolution.lower().split("x"))
        if width <= 0 or height <= 0:
            raise ValueError("resolution components must be positive")
    except ValueError:
        print(f"Error: Invalid resolution '{args.resolution}'. Use WIDTHxHEIGHT (e.g., 768x1024)")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse input formats
    formats = [f.strip() for f in args.input_format.split(",")]

    # Get image files
    image_files = get_image_files(input_dir, formats)

    if not image_files:
        print(f"No images found in '{input_dir}' with formats: {formats}")
        sys.exit(0)

    # Function-scope import: only needed once there is real work to do.
    from PIL import ImageOps

    print(f"Found {len(image_files)} images")
    print(f"Target resolution: {width}x{height}")
    print(f"Crop position: {args.crop_position}")
    print(f"Output format: {args.output_format}")
    print("-" * 50)

    # Save options depend only on the output format, so build them once.
    save_kwargs = {}
    format_key = args.output_format.lower()
    if format_key in ("jpg", "jpeg"):
        save_kwargs["quality"] = 95
        save_kwargs["optimize"] = True
    elif format_key == "webp":
        save_kwargs["quality"] = 95
    elif format_key == "png":
        save_kwargs["optimize"] = True

    processed = 0
    skipped = 0

    for img_path in image_files:
        try:
            with Image.open(img_path) as source:
                # Apply the EXIF orientation tag to the pixels. The saved file
                # carries no EXIF data, so without this rotated photos would be
                # measured, cropped and written in the wrong orientation.
                img = ImageOps.exif_transpose(source)
                orig_w, orig_h = img.size

                # Skip if image is too small (checked before the RGB
                # conversion so skipped files are not converted needlessly)
                if orig_w < width or orig_h < height:
                    print(f"[SKIP] {img_path.name}: {orig_w}x{orig_h} < {width}x{height}")
                    skipped += 1
                    continue

                img = img.convert("RGB")

                # Calculate resize and crop
                (new_w, new_h), crop_box = calculate_resize_and_crop(
                    orig_w, orig_h, width, height, args.crop_position
                )

                # Resize to cover the target, then cut out the requested window
                img_cropped = img.resize((new_w, new_h), Image.LANCZOS).crop(crop_box)

                # Determine output filename; numbering follows the count of
                # successfully written files so the sequence has no gaps.
                if args.output_name_numbers:
                    out_name = f"{processed + 1}.{args.output_format}"
                else:
                    out_name = f"{img_path.stem}.{args.output_format}"

                out_path = output_dir / out_name
                img_cropped.save(out_path, **save_kwargs)

                print(f"[OK] {img_path.name} ({orig_w}x{orig_h}) -> {out_name} ({width}x{height})")
                processed += 1

        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"[ERROR] {img_path.name}: {e}")
            skipped += 1

    print("-" * 50)
    print(f"Done! Processed: {processed}, Skipped: {skipped}")
168
+
169
+
170
def cmd_caption(args):
    """Execute the caption command.

    Generates one caption per image found in ``args.input`` with a Florence-2
    model and writes it to ``<image stem>.txt`` inside ``args.output``.

    Args:
        args: Parsed CLI namespace. The attributes read are ``input``,
            ``output``, ``device``, ``trigger_word``, ``prompt`` and
            ``florence_2_base``.

    Exits with status 1 when the input directory does not exist, when the
    optional captioning dependencies cannot be imported, or when the model
    fails to load. Exits with status 0 when no images are found. Per-image
    failures are reported and do not abort the run.
    """
    input_dir = Path(args.input)
    output_dir = Path(args.output)

    # Validate the path first: it is cheap, and a typo should not have to wait
    # for the heavy torch/transformers import below.
    if not input_dir.exists():
        print(f"Error: Input directory '{input_dir}' does not exist.")
        sys.exit(1)

    # Lazy import for faster CLI startup when not using caption
    try:
        import torch
        from transformers import AutoModelForCausalLM, AutoProcessor
    except ImportError:
        print("Error: Required packages not installed.")
        # Point at the extra so every captioning dependency it declares is
        # installed, not just torch and transformers.
        print('Run: pip install "datasety[caption]"')
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Find images (common formats) before loading the model, so an empty
    # directory does not pay for loading the model weights.
    formats = ["jpg", "jpeg", "png", "webp", "bmp", "tiff"]
    image_files = get_image_files(input_dir, formats)

    if not image_files:
        print(f"No images found in '{input_dir}'")
        sys.exit(0)

    # Determine model (base flag takes priority since large is default)
    if args.florence_2_base:
        model_name = "microsoft/Florence-2-base"
    else:
        model_name = "microsoft/Florence-2-large"

    # Determine device
    if args.device == "cuda" and not torch.cuda.is_available():
        print("Warning: CUDA not available, falling back to CPU")
        device = "cpu"
    else:
        device = args.device

    # Half precision is only used on CUDA; CPU runs in float32.
    torch_dtype = torch.float16 if device == "cuda" else torch.float32

    print(f"Loading model: {model_name}")
    print(f"Device: {device}")

    try:
        # NOTE(security): trust_remote_code=True executes Python code shipped
        # with the model repository. The model names are fixed above; do not
        # make them user-configurable without revisiting this.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=True
        ).to(device).eval()
        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    except Exception as e:
        print(f"Error loading model: {e}")
        sys.exit(1)

    print(f"Found {len(image_files)} images")
    print(f"Prompt: {args.prompt}")
    if args.trigger_word:
        print(f"Trigger word: {args.trigger_word}")
    print("-" * 50)

    processed = 0

    for img_path in image_files:
        try:
            with Image.open(img_path) as source:
                img = source.convert("RGB")

                inputs = processor(
                    text=args.prompt,
                    images=img,
                    return_tensors="pt"
                ).to(device, torch_dtype)

                with torch.no_grad():
                    generated_ids = model.generate(
                        input_ids=inputs["input_ids"],
                        pixel_values=inputs["pixel_values"],
                        max_new_tokens=1024,
                        num_beams=3,
                        do_sample=False
                    )

                generated_text = processor.batch_decode(
                    generated_ids, skip_special_tokens=False
                )[0]

                parsed = processor.post_process_generation(
                    generated_text,
                    task=args.prompt,
                    image_size=(img.width, img.height)
                )

                # The parsed result is looked up by the task prompt; fall back
                # to an empty caption when that key is missing.
                caption = parsed.get(args.prompt, "")

                # Prepend trigger word if specified
                if args.trigger_word:
                    caption = f"{args.trigger_word} {caption}"

                # Save caption; explicit UTF-8 so the output does not depend on
                # the platform's default encoding.
                caption_path = output_dir / f"{img_path.stem}.txt"
                caption_path.write_text(caption.strip(), encoding="utf-8")

                print(f"[OK] {img_path.name}")
                print(f"     {caption[:100]}{'...' if len(caption) > 100 else ''}")
                processed += 1

        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"[ERROR] {img_path.name}: {e}")

    print("-" * 50)
    print(f"Done! Processed: {processed} images")
284
+
285
+
286
def main(argv=None):
    """Parse command-line arguments and dispatch to the selected subcommand.

    Args:
        argv: Optional list of argument strings (without the program name).
            When ``None`` (the default, used by the console script and
            ``python -m datasety``), argparse reads ``sys.argv[1:]``. Passing
            an explicit list allows programmatic invocation.

    A subcommand is required. The chosen subparser stores its handler in
    ``args.func``, which is then called with the parsed namespace.
    """
    parser = argparse.ArgumentParser(
        prog="datasety",
        description="CLI tool for dataset preparation: image resizing and captioning."
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    # === RESIZE command ===
    resize_parser = subparsers.add_parser(
        "resize",
        help="Resize and crop images to target resolution"
    )
    resize_parser.add_argument(
        "--input", "-i",
        required=True,
        help="Input directory containing images"
    )
    resize_parser.add_argument(
        "--output", "-o",
        required=True,
        help="Output directory for processed images"
    )
    resize_parser.add_argument(
        "--resolution", "-r",
        required=True,
        help="Target resolution as WIDTHxHEIGHT (e.g., 768x1024)"
    )
    resize_parser.add_argument(
        "--crop-position",
        choices=["top", "center", "bottom", "left", "right"],
        default="center",
        help="Position to keep when cropping (default: center)"
    )
    resize_parser.add_argument(
        "--input-format",
        default="jpg,jpeg,png,webp",
        help="Comma-separated input formats (default: jpg,jpeg,png,webp)"
    )
    resize_parser.add_argument(
        "--output-format",
        choices=["jpg", "png", "webp"],
        default="jpg",
        help="Output image format (default: jpg)"
    )
    resize_parser.add_argument(
        "--output-name-numbers",
        action="store_true",
        help="Rename output files to sequential numbers (1.jpg, 2.jpg, ...)"
    )
    resize_parser.set_defaults(func=cmd_resize)

    # === CAPTION command ===
    caption_parser = subparsers.add_parser(
        "caption",
        help="Generate captions for images using Florence-2"
    )
    caption_parser.add_argument(
        "--input", "-i",
        required=True,
        help="Input directory containing images"
    )
    caption_parser.add_argument(
        "--output", "-o",
        required=True,
        help="Output directory for caption text files"
    )
    caption_parser.add_argument(
        "--device",
        choices=["cpu", "cuda"],
        default="cpu",
        help="Device to run model on (default: cpu)"
    )
    caption_parser.add_argument(
        "--trigger-word",
        default="",
        help="Text to prepend to each caption (e.g., '[trigger]' or 'photo,')"
    )
    caption_parser.add_argument(
        "--prompt",
        default="<MORE_DETAILED_CAPTION>",
        help="Florence-2 prompt (default: <MORE_DETAILED_CAPTION>)"
    )

    # The two model flags cannot be combined. The caption handler selects the
    # large model whenever --florence-2-base is not given.
    model_group = caption_parser.add_mutually_exclusive_group()
    model_group.add_argument(
        "--florence-2-base",
        action="store_true",
        help="Use Florence-2-base model (0.23B params, faster)"
    )
    model_group.add_argument(
        "--florence-2-large",
        action="store_true",
        help="Use Florence-2-large model (0.77B params, more accurate) [default]"
    )
    caption_parser.set_defaults(func=cmd_caption)

    # Parse and execute
    args = parser.parse_args(argv)
    args.func(args)


if __name__ == "__main__":
    main()
File without changes
File without changes
@@ -0,0 +1,84 @@
1
+ """Tests for the resize command."""
2
+
3
+ import pytest
4
+ from datasety.cli import calculate_resize_and_crop, get_image_files
5
+ from pathlib import Path
6
+
7
+
8
class TestCalculateResizeAndCrop:
    """Checks the scaled size and crop box returned by ``calculate_resize_and_crop``."""

    def test_wider_image_center_crop(self):
        """A landscape source is scaled by height and trimmed evenly on both sides."""
        # Source 2000x1000, target 1024x1024
        size, box = calculate_resize_and_crop(2000, 1000, 1024, 1024, "center")
        expected_left = (2048 - 1024) // 2
        # Width follows the aspect ratio once the height is pinned to the target.
        assert size == (2048, 1024)
        assert box == (expected_left, 0, expected_left + 1024, 1024)

    def test_taller_image_center_crop(self):
        """A portrait source is scaled by width and trimmed evenly top and bottom."""
        # Source 1000x2000, target 1024x1024
        size, box = calculate_resize_and_crop(1000, 2000, 1024, 1024, "center")
        expected_top = (2048 - 1024) // 2
        assert size == (1024, 2048)
        assert box == (0, expected_top, 1024, expected_top + 1024)

    def test_top_crop(self):
        """The 'top' position anchors the window to the upper edge."""
        _, (_, top, _, bottom) = calculate_resize_and_crop(
            1000, 2000, 1024, 1024, "top"
        )
        assert (top, bottom) == (0, 1024)

    def test_bottom_crop(self):
        """The 'bottom' position anchors the window to the lower edge."""
        (_, scaled_h), (_, top, _, bottom) = calculate_resize_and_crop(
            1000, 2000, 1024, 1024, "bottom"
        )
        assert (top, bottom) == (scaled_h - 1024, scaled_h)

    def test_left_crop(self):
        """The 'left' position anchors the window to the left edge."""
        _, (left, _, right, _) = calculate_resize_and_crop(
            2000, 1000, 1024, 1024, "left"
        )
        assert (left, right) == (0, 1024)

    def test_right_crop(self):
        """The 'right' position anchors the window to the right edge."""
        (scaled_w, _), (left, _, right, _) = calculate_resize_and_crop(
            2000, 1000, 1024, 1024, "right"
        )
        assert (left, right) == (scaled_w - 1024, scaled_w)

    def test_non_square_target(self):
        """A portrait target applied to a landscape source scales by height."""
        # Source 2000x1500, target 768x1024: source ratio 1.33 exceeds the
        # target ratio 0.75, so the height is pinned and the width is cropped.
        (scaled_w, scaled_h), (left, top, right, bottom) = calculate_resize_and_crop(
            2000, 1500, 768, 1024, "center"
        )
        assert scaled_h == 1024
        assert scaled_w == int(2000 * (1024 / 1500))  # 1365
        assert (right - left, bottom - top) == (768, 1024)

    def test_invalid_crop_position(self):
        """An unknown crop position is rejected with ValueError."""
        with pytest.raises(ValueError):
            calculate_resize_and_crop(1000, 1000, 512, 512, "invalid")