dataset-toolkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataset_toolkit/__init__.py +79 -0
- dataset_toolkit/exporters/__init__.py +0 -0
- dataset_toolkit/exporters/coco_exporter.py +64 -0
- dataset_toolkit/exporters/txt_exporter.py +32 -0
- dataset_toolkit/loaders/__init__.py +0 -0
- dataset_toolkit/loaders/local_loader.py +64 -0
- dataset_toolkit/models.py +26 -0
- dataset_toolkit/pipeline.py +176 -0
- dataset_toolkit/processors/__init__.py +0 -0
- dataset_toolkit/processors/merger.py +62 -0
- dataset_toolkit/utils/__init__.py +0 -0
- dataset_toolkit/utils/coords.py +15 -0
- dataset_toolkit-0.1.0.dist-info/METADATA +273 -0
- dataset_toolkit-0.1.0.dist-info/RECORD +17 -0
- dataset_toolkit-0.1.0.dist-info/WHEEL +5 -0
- dataset_toolkit-0.1.0.dist-info/licenses/LICENSE +22 -0
- dataset_toolkit-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
"""
|
2
|
+
Dataset Toolkit - 计算机视觉数据集处理工具包
|
3
|
+
|
4
|
+
这个工具包提供了一套完整的解决方案,用于加载、处理和导出计算机视觉数据集。
|
5
|
+
|
6
|
+
主要功能:
|
7
|
+
- 加载多种格式的数据集(YOLO、COCO等)
|
8
|
+
- 合并和转换数据集
|
9
|
+
- 导出为标准格式
|
10
|
+
- 坐标转换等工具函数
|
11
|
+
|
12
|
+
基本用法:
|
13
|
+
>>> from dataset_toolkit import load_yolo_from_local, export_to_coco
|
14
|
+
>>> dataset = load_yolo_from_local("/path/to/dataset", {0: 'cat'})
|
15
|
+
>>> export_to_coco(dataset, "output.json")
|
16
|
+
"""
|
17
|
+
|
18
|
+
__version__ = "0.1.0"
|
19
|
+
__author__ = "Your Name"
|
20
|
+
__email__ = "your.email@example.com"
|
21
|
+
|
22
|
+
# 导入核心类和函数,提供简洁的顶层API
|
23
|
+
from dataset_toolkit.models import (
|
24
|
+
Dataset,
|
25
|
+
ImageAnnotation,
|
26
|
+
Annotation
|
27
|
+
)
|
28
|
+
|
29
|
+
from dataset_toolkit.loaders.local_loader import (
|
30
|
+
load_yolo_from_local
|
31
|
+
)
|
32
|
+
|
33
|
+
from dataset_toolkit.processors.merger import (
|
34
|
+
merge_datasets
|
35
|
+
)
|
36
|
+
|
37
|
+
from dataset_toolkit.exporters.coco_exporter import (
|
38
|
+
export_to_coco
|
39
|
+
)
|
40
|
+
|
41
|
+
from dataset_toolkit.exporters.txt_exporter import (
|
42
|
+
export_to_txt
|
43
|
+
)
|
44
|
+
|
45
|
+
from dataset_toolkit.utils.coords import (
|
46
|
+
yolo_to_absolute_bbox
|
47
|
+
)
|
48
|
+
|
49
|
+
from dataset_toolkit.pipeline import (
|
50
|
+
DatasetPipeline
|
51
|
+
)
|
52
|
+
|
53
|
+
# 定义公共API
|
54
|
+
__all__ = [
|
55
|
+
# 版本信息
|
56
|
+
"__version__",
|
57
|
+
|
58
|
+
# 数据模型
|
59
|
+
"Dataset",
|
60
|
+
"ImageAnnotation",
|
61
|
+
"Annotation",
|
62
|
+
|
63
|
+
# 加载器
|
64
|
+
"load_yolo_from_local",
|
65
|
+
|
66
|
+
# 处理器
|
67
|
+
"merge_datasets",
|
68
|
+
|
69
|
+
# 导出器
|
70
|
+
"export_to_coco",
|
71
|
+
"export_to_txt",
|
72
|
+
|
73
|
+
# 工具函数
|
74
|
+
"yolo_to_absolute_bbox",
|
75
|
+
|
76
|
+
# 管道API
|
77
|
+
"DatasetPipeline",
|
78
|
+
]
|
79
|
+
|
File without changes
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# dataset_toolkit/exporters/coco_exporter.py
|
2
|
+
import json
|
3
|
+
import datetime
|
4
|
+
from dataset_toolkit.models import Dataset
|
5
|
+
|
6
|
+
def export_to_coco(dataset: Dataset, output_path: str):
    """Export a dataset object to a COCO-format JSON file.

    Images receive sequential ids starting at 1 in the order they appear in
    ``dataset.images``; annotations receive sequential ids starting at 1
    across the whole dataset. Each image's ``file_name`` is its ``image_id``.
    Bounding boxes are written as ``[x_min, y_min, width, height]`` rounded
    to two decimals, and ``area`` is ``width * height`` rounded likewise.

    Args:
        dataset (Dataset): Internal standard dataset object.
        output_path (str): Destination path of the ``.json`` file. Missing
            parent directories are created.
    """
    # Local import: the module's top-level imports do not include ``os``.
    import os

    # ``datetime.utcnow()`` is deprecated; build the same naive-UTC ISO
    # string from an aware timestamp so the output format does not change.
    created = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    # 1. Categories, in the iteration order of ``dataset.categories``.
    categories = [
        {"id": cat_id, "name": cat_name, "supercategory": "none"}
        for cat_id, cat_name in dataset.categories.items()
    ]

    # 2. Images and their annotations.
    images = []
    annotations = []
    for image_id, image_ann in enumerate(dataset.images, 1):
        images.append({
            "id": image_id,
            "file_name": image_ann.image_id,  # original file name
            "width": image_ann.width,
            "height": image_ann.height,
        })

        for ann in image_ann.annotations:
            _x_min, _y_min, width, height = ann.bbox
            annotations.append({
                "id": len(annotations) + 1,
                "image_id": image_id,
                "category_id": ann.category_id,
                "bbox": [round(c, 2) for c in ann.bbox],
                "area": round(width * height, 2),
                "iscrowd": 0,
                "segmentation": [],  # bbox-only export: no polygons
            })

    coco_format = {
        "info": {
            "description": f"Exported from dataset_toolkit: {dataset.name}",
            "date_created": created.isoformat(),
        },
        "licenses": [],
        "images": images,
        "annotations": annotations,
        "categories": categories,
    }

    # 3. Write the JSON file, creating the target directory when needed.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # UTF-8 + ensure_ascii=False keeps non-ASCII category names readable.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(coco_format, f, indent=4, ensure_ascii=False)

    print(f"成功将数据集导出为COCO格式: {output_path}")
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# dataset_toolkit/exporters/txt_exporter.py
|
2
|
+
import os
|
3
|
+
from dataset_toolkit.models import Dataset
|
4
|
+
|
5
|
+
def export_to_txt(dataset: Dataset, output_path: str, use_relative_paths: bool = False, base_path: str = None):
    """Export a dataset to a TXT file with one image record per line.

    Line format::

        image_path class_id,x_min,y_min,x_max,y_max class_id,x_min,y_min,x_max,y_max ...

    Each stored box ``[x_min, y_min, width, height]`` is converted to corner
    form and rounded to whole pixels. An image without annotations produces
    a line holding only its path. Fields are separated by single spaces, so
    an image path that itself contains spaces makes the line ambiguous.

    Args:
        dataset (Dataset): Internal standard dataset object.
        output_path (str): Destination path of the ``.txt`` file.
        use_relative_paths (bool): Write image paths relative to
            ``base_path`` instead of the stored path.
        base_path (str): Base directory for relative paths. ``None`` means
            the current working directory.
    """
    lines = []
    for image_ann in dataset.images:
        path = image_ann.path
        if use_relative_paths:
            try:
                path = os.path.relpath(path, start=base_path)
            except ValueError:
                # On Windows, relpath raises ValueError when the two paths
                # are on different drives; keep the stored path in that case.
                pass

        fields = [f"{path}"]
        for ann in image_ann.annotations:
            x_min, y_min, width, height = ann.bbox
            corners = (x_min, y_min, x_min + width, y_min + height)
            fields.append(
                ",".join([str(ann.category_id), *(str(int(round(v))) for v in corners)])
            )
        lines.append(" ".join(fields))

    # Explicit UTF-8 so non-ASCII paths do not depend on the locale encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    print(f"成功将数据集导出为TXT格式: {output_path}")
|
File without changes
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# dataset_toolkit/loaders/local_loader.py
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Dict
|
4
|
+
from PIL import Image
|
5
|
+
|
6
|
+
# 从我们自己的包中导入模块
|
7
|
+
from dataset_toolkit.models import Dataset, ImageAnnotation, Annotation
|
8
|
+
from dataset_toolkit.utils.coords import yolo_to_absolute_bbox
|
9
|
+
|
10
|
+
def load_yolo_from_local(dataset_path: str, categories: Dict[int, str]) -> Dataset:
    """Load a YOLO-format dataset from the local file system.

    Images are read from ``<dataset_path>/images`` and labels from
    ``<dataset_path>/labels``. The label file for an image is the ``.txt``
    file with the same stem. Images are processed in sorted path order so
    repeated loads of the same directory give the same image order.

    Args:
        dataset_path (str): Root directory of the dataset.
        categories (Dict[int, str]): Mapping of class id to class name,
            stored on the returned dataset unchanged.

    Returns:
        Dataset: Dataset named after the root directory. Unreadable images
        and unparseable label lines are skipped with a printed warning.

    Raises:
        FileNotFoundError: If the ``images`` or ``labels`` directory is
            missing.
    """
    root_path = Path(dataset_path)
    image_dir = root_path / 'images'
    label_dir = root_path / 'labels'

    if not image_dir.is_dir():
        raise FileNotFoundError(f"图片目录不存在: {image_dir}")
    if not label_dir.is_dir():
        raise FileNotFoundError(f"标注目录不存在: {label_dir}")

    dataset = Dataset(name=root_path.name, categories=categories)
    supported_extensions = {'.jpg', '.jpeg', '.png'}

    print(f"开始加载数据集: {root_path.name}...")

    # iterdir() yields entries in arbitrary filesystem order; sort them so
    # the resulting image order is reproducible.
    for image_path in sorted(image_dir.iterdir()):
        if image_path.suffix.lower() not in supported_extensions:
            continue

        try:
            with Image.open(image_path) as img:
                img_width, img_height = img.size
        except IOError:
            print(f"警告: 无法打开图片,已跳过: {image_path}")
            continue

        image_annotation = ImageAnnotation(
            image_id=image_path.name,
            path=str(image_path.resolve()),
            width=img_width,
            height=img_height
        )

        label_path = label_dir / (image_path.stem + '.txt')
        if label_path.exists():
            with open(label_path, 'r', encoding='utf-8') as f:
                for line in f:
                    stripped = line.strip()
                    if not stripped:
                        # Blank lines are not an error; skip them silently.
                        continue

                    try:
                        parts = [float(p) for p in stripped.split()]
                        # int() raises ValueError for nan and OverflowError
                        # for inf, so convert inside the guarded region.
                        cls_id = int(parts[0])
                    except (ValueError, IndexError, OverflowError):
                        print(f"警告: 无法解析行,已跳过: {label_path} -> '{stripped}'")
                        continue

                    if len(parts) != 5:
                        # Not "class cx cy w h" (e.g. a polygon label):
                        # report it instead of dropping it without a trace.
                        print(f"警告: 无法解析行,已跳过: {label_path} -> '{stripped}'")
                        continue

                    abs_bbox = yolo_to_absolute_bbox(tuple(parts[1:]), img_width, img_height)
                    image_annotation.annotations.append(
                        Annotation(category_id=cls_id, bbox=abs_bbox)
                    )

        dataset.images.append(image_annotation)

    print(f"加载完成. 共找到 {len(dataset.images)} 张图片.")
    return dataset
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# dataset_toolkit/models.py
|
2
|
+
from dataclasses import dataclass, field
|
3
|
+
from typing import List, Dict
|
4
|
+
|
5
|
+
@dataclass
class Annotation:
    """A single bounding-box annotation."""

    # Identifier of the category this box belongs to.
    category_id: int
    # Box stored as [x_min, y_min, width, height] in absolute pixel units.
    bbox: List[float]
|
11
|
+
|
12
|
+
@dataclass
class ImageAnnotation:
    """One image together with all annotations attached to it."""

    # Identifier of the image.
    image_id: str
    # Location of the image file.
    path: str
    # Image width.
    width: int
    # Image height.
    height: int
    # Annotations for this image; each instance gets its own empty list.
    annotations: List[Annotation] = field(default_factory=list)
|
20
|
+
|
21
|
+
@dataclass
class Dataset:
    """A complete dataset; the standardized in-memory representation."""

    # Name of the dataset.
    name: str
    # Images of the dataset; each instance gets its own empty list.
    images: List[ImageAnnotation] = field(default_factory=list)
    # Mapping of category id to category name; empty by default.
    categories: Dict[int, str] = field(default_factory=dict)
|
@@ -0,0 +1,176 @@
|
|
1
|
+
# dataset_toolkit/pipeline.py
|
2
|
+
"""
|
3
|
+
提供链式API,让数据集处理更加流畅和优雅
|
4
|
+
"""
|
5
|
+
from typing import Dict, List, Optional
|
6
|
+
from pathlib import Path
|
7
|
+
from dataset_toolkit.models import Dataset
|
8
|
+
from dataset_toolkit.loaders.local_loader import load_yolo_from_local
|
9
|
+
from dataset_toolkit.processors.merger import merge_datasets
|
10
|
+
from dataset_toolkit.exporters.coco_exporter import export_to_coco
|
11
|
+
from dataset_toolkit.exporters.txt_exporter import export_to_txt
|
12
|
+
|
13
|
+
|
14
|
+
class DatasetPipeline:
    """Dataset processing pipeline with a chainable (fluent) API.

    Every step returns the pipeline itself so calls can be chained. The
    pipeline tracks all loaded datasets, the "current" dataset (the most
    recently loaded or merged one) and a textual log of performed steps.

    Example:
        >>> pipeline = DatasetPipeline()
        >>> result = (pipeline
        ...     .load_yolo("/path/to/dataset1", {0: 'cat'})
        ...     .load_yolo("/path/to/dataset2", {0: 'dog'})
        ...     .merge(
        ...         category_mapping={'cat': 'animal', 'dog': 'animal'},
        ...         final_categories={0: 'animal'}
        ...     )
        ...     .export_coco("output.json")
        ...     .get_result())
    """

    def __init__(self):
        # Every dataset loaded so far; all of them take part in merge().
        self._datasets: List[Dataset] = []
        # Dataset that export_*() and get_result() operate on.
        self._current_dataset: Optional[Dataset] = None
        # Human-readable log of the steps performed, in order.
        self._operations: List[str] = []

    def load_yolo(self, dataset_path: str, categories: Dict[int, str]) -> 'DatasetPipeline':
        """Load a YOLO-format dataset and make it the current dataset.

        Args:
            dataset_path: Path of the dataset.
            categories: Category mapping passed to the loader.

        Returns:
            self: The pipeline, to allow chaining.
        """
        dataset = load_yolo_from_local(dataset_path, categories)
        self._datasets.append(dataset)
        self._current_dataset = dataset
        self._operations.append(f"加载数据集: {dataset_path}")
        return self

    def merge(
        self,
        category_mapping: Dict[str, str],
        final_categories: Dict[int, str],
        new_dataset_name: str = "merged_dataset"
    ) -> 'DatasetPipeline':
        """Merge all datasets loaded so far into a new current dataset.

        Args:
            category_mapping: Category remapping rules.
            final_categories: Final category scheme.
            new_dataset_name: Name of the merged dataset.

        Returns:
            self: The pipeline, to allow chaining.

        Raises:
            ValueError: If no dataset has been loaded.
        """
        if not self._datasets:
            raise ValueError("至少需要一个数据集才能执行合并操作")

        self._current_dataset = merge_datasets(
            datasets=self._datasets,
            category_mapping=category_mapping,
            final_categories=final_categories,
            new_dataset_name=new_dataset_name
        )
        self._operations.append(f"合并 {len(self._datasets)} 个数据集")
        return self

    def export_coco(self, output_path: str) -> 'DatasetPipeline':
        """Export the current dataset in COCO format.

        Args:
            output_path: Output file path.

        Returns:
            self: The pipeline, to allow chaining.

        Raises:
            ValueError: If there is no current dataset.
        """
        if self._current_dataset is None:
            raise ValueError("没有可导出的数据集")

        export_to_coco(self._current_dataset, output_path)
        self._operations.append(f"导出COCO格式: {output_path}")
        return self

    def export_txt(
        self,
        output_path: str,
        use_relative_paths: bool = False,
        base_path: Optional[str] = None
    ) -> 'DatasetPipeline':
        """Export the current dataset in TXT format.

        Args:
            output_path: Output file path.
            use_relative_paths: Whether to write relative image paths.
            base_path: Base directory for relative paths.

        Returns:
            self: The pipeline, to allow chaining.

        Raises:
            ValueError: If there is no current dataset.
        """
        if self._current_dataset is None:
            raise ValueError("没有可导出的数据集")

        export_to_txt(self._current_dataset, output_path, use_relative_paths, base_path)
        self._operations.append(f"导出TXT格式: {output_path}")
        return self

    def filter_by_category(self, category_ids: List[int]) -> 'DatasetPipeline':
        """Filter the dataset by category (planned, not implemented yet).

        Args:
            category_ids: Category ids to keep.

        Returns:
            self: The pipeline, to allow chaining.

        Raises:
            NotImplementedError: Always; the feature is not implemented.
        """
        # TODO: implement category filtering.
        raise NotImplementedError("此功能尚未实现")

    def get_result(self) -> Dataset:
        """Return the current dataset.

        Returns:
            Dataset: The current dataset object.

        Raises:
            ValueError: If the pipeline holds no dataset.
        """
        if self._current_dataset is None:
            raise ValueError("管道中没有任何数据集")
        return self._current_dataset

    def get_summary(self) -> str:
        """Build a summary of the performed steps.

        Returns:
            str: A header, the numbered operation log and, when a current
            dataset exists, its name, image count and categories.
        """
        summary = "数据集处理管道操作摘要:\n"
        summary += "\n".join(
            f"{index}. {op}" for index, op in enumerate(self._operations, 1)
        )

        # Explicit None check, matching the other methods.
        if self._current_dataset is not None:
            summary += "\n\n最终结果:"
            summary += f"\n - 数据集名称: {self._current_dataset.name}"
            summary += f"\n - 图片数量: {len(self._current_dataset.images)}"
            summary += f"\n - 类别: {self._current_dataset.categories}"

        return summary

    def reset(self) -> 'DatasetPipeline':
        """Clear loaded datasets, the current dataset and the operation log.

        Returns:
            self: The pipeline, to allow chaining.
        """
        self._datasets = []
        self._current_dataset = None
        self._operations = []
        return self
|
176
|
+
|
File without changes
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# dataset_toolkit/processors/merger.py
|
2
|
+
import copy
|
3
|
+
from typing import List, Dict
|
4
|
+
from dataset_toolkit.models import Dataset
|
5
|
+
|
6
|
+
def merge_datasets(
    datasets: List[Dataset],
    category_mapping: Dict[str, str],
    final_categories: Dict[int, str],
    new_dataset_name: str = "merged_dataset"
) -> Dataset:
    """Merge several datasets into one, based on a fixed final category scheme.

    For each source dataset, every category name is translated through
    ``category_mapping`` (names without a rule are kept as they are) and
    then looked up in ``final_categories``. Annotations whose category has
    no counterpart in the final scheme are dropped; the images themselves
    are always kept. The inputs are not modified.

    Args:
        datasets (List[Dataset]): Datasets to merge.
        category_mapping (Dict[str, str]): Old category name -> new category
            name, e.g. {'cat': 'animal', 'dog': 'animal', 'car': 'vehicle'}.
        final_categories (Dict[int, str]): The final, target category
            scheme, e.g. {0: 'animal', 1: 'vehicle', 2: 'bicycle'}.
        new_dataset_name (str): Name of the merged dataset.

    Returns:
        Dataset: A new merged dataset carrying its own copy of
        ``final_categories`` (also when ``datasets`` is empty).
    """
    # Copy the scheme so later edits of the merged dataset's categories do
    # not leak back into the caller's dict.
    merged_dataset = Dataset(name=new_dataset_name, categories=dict(final_categories))
    if not datasets:
        return merged_dataset

    # Reverse lookup (name -> id) into the final scheme.
    final_name_to_id = {name: cat_id for cat_id, name in final_categories.items()}

    for ds in datasets:
        # Table translating this dataset's ids to final ids.
        id_remap_table = {}
        for old_id, old_name in ds.categories.items():
            final_name = category_mapping.get(old_name, old_name)
            if final_name in final_name_to_id:
                id_remap_table[old_id] = final_name_to_id[final_name]

        for image_ann in ds.images:
            # Deep copy so the source dataset is left untouched.
            new_image_ann = copy.deepcopy(image_ann)

            kept_annotations = []
            for ann in new_image_ann.annotations:
                # Only annotations with a remappable category survive.
                if ann.category_id in id_remap_table:
                    ann.category_id = id_remap_table[ann.category_id]
                    kept_annotations.append(ann)

            new_image_ann.annotations = kept_annotations
            merged_dataset.images.append(new_image_ann)

    print(f"合并完成. 新数据集 '{merged_dataset.name}' 包含 {len(merged_dataset.images)} 张图片。")
    print(f"最终类别体系: {merged_dataset.categories}")
    return merged_dataset
|
File without changes
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# dataset_toolkit/utils/coords.py
|
2
|
+
from typing import Tuple, List
|
3
|
+
|
4
|
+
def yolo_to_absolute_bbox(yolo_bbox: Tuple[float, ...], img_width: int, img_height: int) -> List[float]:
    """Convert a relative YOLO box to an absolute pixel box.

    The input is (x_center, y_center, width, height) relative to the image
    size; the result is [x_min, y_min, width, height] in pixels.
    """
    cx, cy, w_frac, h_frac = yolo_bbox

    # Scale the relative size up to pixels first ...
    box_w = w_frac * img_width
    box_h = h_frac * img_height

    # ... then move from the box centre to its top-left corner.
    left = (cx * img_width) - (box_w / 2)
    top = (cy * img_height) - (box_h / 2)

    return [left, top, box_w, box_h]
|
@@ -0,0 +1,273 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: dataset-toolkit
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: 一个用于加载、处理和导出计算机视觉数据集的工具包
|
5
|
+
Home-page: https://github.com/yourusername/dataset-toolkit
|
6
|
+
Author: Your Name
|
7
|
+
Author-email: Your Name <your.email@example.com>
|
8
|
+
License: MIT
|
9
|
+
Project-URL: Homepage, https://github.com/yourusername/dataset-toolkit
|
10
|
+
Project-URL: Documentation, https://dataset-toolkit.readthedocs.io
|
11
|
+
Project-URL: Repository, https://github.com/yourusername/dataset-toolkit
|
12
|
+
Project-URL: Bug Tracker, https://github.com/yourusername/dataset-toolkit/issues
|
13
|
+
Keywords: computer vision,dataset,yolo,coco,machine learning
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
15
|
+
Classifier: Intended Audience :: Developers
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
18
|
+
Classifier: License :: OSI Approved :: MIT License
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
20
|
+
Requires-Python: >=3.7
|
21
|
+
Description-Content-Type: text/markdown
|
22
|
+
License-File: LICENSE
|
23
|
+
Requires-Dist: Pillow>=8.0.0
|
24
|
+
Provides-Extra: dev
|
25
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
26
|
+
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
27
|
+
Requires-Dist: black>=21.0; extra == "dev"
|
28
|
+
Requires-Dist: flake8>=3.9; extra == "dev"
|
29
|
+
Dynamic: author
|
30
|
+
Dynamic: home-page
|
31
|
+
Dynamic: license-file
|
32
|
+
Dynamic: requires-python
|
33
|
+
|
34
|
+
# Dataset Toolkit
|
35
|
+
|
36
|
+
一个功能强大、易于使用的 Python 工具包,用于处理计算机视觉数据集。支持多种数据格式的加载、合并、转换和导出。
|
37
|
+
|
38
|
+
## ✨ 特性
|
39
|
+
|
40
|
+
- 🔄 **多格式支持**:支持 YOLO、COCO 等常见格式
|
41
|
+
- 🔗 **数据集合并**:轻松合并多个数据集,支持类别重映射
|
42
|
+
- 📤 **灵活导出**:导出为 COCO JSON、TXT 等多种格式
|
43
|
+
- 🛠️ **工具函数**:提供坐标转换等实用工具
|
44
|
+
- 📦 **标准化数据模型**:统一的内部数据表示,方便扩展
|
45
|
+
|
46
|
+
## 📦 安装
|
47
|
+
|
48
|
+
### 从 PyPI 安装(推荐)
|
49
|
+
|
50
|
+
```bash
|
51
|
+
pip install dataset-toolkit
|
52
|
+
```
|
53
|
+
|
54
|
+
### 从源码安装
|
55
|
+
|
56
|
+
```bash
|
57
|
+
git clone https://github.com/yourusername/dataset-toolkit.git
|
58
|
+
cd dataset-toolkit
|
59
|
+
pip install -e .
|
60
|
+
```
|
61
|
+
|
62
|
+
### 开发模式安装
|
63
|
+
|
64
|
+
```bash
|
65
|
+
pip install -e ".[dev]"
|
66
|
+
```
|
67
|
+
|
68
|
+
## 🚀 快速开始
|
69
|
+
|
70
|
+
### 基本用法
|
71
|
+
|
72
|
+
```python
|
73
|
+
from dataset_toolkit import load_yolo_from_local, merge_datasets, export_to_coco
|
74
|
+
|
75
|
+
# 1. 加载 YOLO 格式数据集
|
76
|
+
dataset1 = load_yolo_from_local(
|
77
|
+
"/path/to/dataset1",
|
78
|
+
categories={0: 'cat', 1: 'dog'}
|
79
|
+
)
|
80
|
+
|
81
|
+
dataset2 = load_yolo_from_local(
|
82
|
+
"/path/to/dataset2",
|
83
|
+
categories={0: 'car', 1: 'bicycle'}
|
84
|
+
)
|
85
|
+
|
86
|
+
# 2. 合并数据集(带类别重映射)
|
87
|
+
final_categories = {0: 'animal', 1: 'vehicle'}
|
88
|
+
category_mapping = {
|
89
|
+
'cat': 'animal',
|
90
|
+
'dog': 'animal',
|
91
|
+
'car': 'vehicle',
|
92
|
+
'bicycle': 'vehicle'
|
93
|
+
}
|
94
|
+
|
95
|
+
merged = merge_datasets(
|
96
|
+
datasets=[dataset1, dataset2],
|
97
|
+
category_mapping=category_mapping,
|
98
|
+
final_categories=final_categories,
|
99
|
+
new_dataset_name="merged_dataset"
|
100
|
+
)
|
101
|
+
|
102
|
+
# 3. 导出为 COCO 格式
|
103
|
+
export_to_coco(merged, "output/merged.json")
|
104
|
+
```
|
105
|
+
|
106
|
+
### 链式 API 用法
|
107
|
+
|
108
|
+
```python
|
109
|
+
from dataset_toolkit import DatasetPipeline
|
110
|
+
|
111
|
+
# 使用管道模式处理数据集
|
112
|
+
pipeline = DatasetPipeline()
|
113
|
+
result = (pipeline
|
114
|
+
.load_yolo("/path/to/dataset1", {0: 'cat', 1: 'dog'})
|
115
|
+
.load_yolo("/path/to/dataset2", {0: 'car'})
|
116
|
+
.merge(
|
117
|
+
category_mapping={'cat': 'animal', 'dog': 'animal', 'car': 'vehicle'},
|
118
|
+
final_categories={0: 'animal', 1: 'vehicle'}
|
119
|
+
)
|
120
|
+
.export_coco("output/merged.json")
|
121
|
+
.get_result())
|
122
|
+
```
|
123
|
+
|
124
|
+
## 📚 API 文档
|
125
|
+
|
126
|
+
### 数据加载器
|
127
|
+
|
128
|
+
#### `load_yolo_from_local(dataset_path, categories)`
|
129
|
+
|
130
|
+
从本地文件系统加载 YOLO 格式的数据集。
|
131
|
+
|
132
|
+
**参数:**
|
133
|
+
- `dataset_path` (str): 数据集根目录路径,应包含 `images/` 和 `labels/` 子目录
|
134
|
+
- `categories` (Dict[int, str]): 类别ID到类别名的映射
|
135
|
+
|
136
|
+
**返回:**
|
137
|
+
- `Dataset`: 标准化的数据集对象
|
138
|
+
|
139
|
+
**示例:**
|
140
|
+
```python
|
141
|
+
dataset = load_yolo_from_local(
|
142
|
+
"/data/my_dataset",
|
143
|
+
categories={0: 'person', 1: 'car'}
|
144
|
+
)
|
145
|
+
```
|
146
|
+
|
147
|
+
### 数据处理器
|
148
|
+
|
149
|
+
#### `merge_datasets(datasets, category_mapping, final_categories, new_dataset_name)`
|
150
|
+
|
151
|
+
合并多个数据集,支持类别重映射。
|
152
|
+
|
153
|
+
**参数:**
|
154
|
+
- `datasets` (List[Dataset]): 要合并的数据集列表
|
155
|
+
- `category_mapping` (Dict[str, str]): 旧类别名到新类别名的映射
|
156
|
+
- `final_categories` (Dict[int, str]): 最终的类别体系
|
157
|
+
- `new_dataset_name` (str, optional): 合并后数据集的名称
|
158
|
+
|
159
|
+
**返回:**
|
160
|
+
- `Dataset`: 合并后的数据集对象
|
161
|
+
|
162
|
+
### 数据导出器
|
163
|
+
|
164
|
+
#### `export_to_coco(dataset, output_path)`
|
165
|
+
|
166
|
+
导出为 COCO JSON 格式。
|
167
|
+
|
168
|
+
**参数:**
|
169
|
+
- `dataset` (Dataset): 要导出的数据集
|
170
|
+
- `output_path` (str): 输出文件路径
|
171
|
+
|
172
|
+
#### `export_to_txt(dataset, output_path, use_relative_paths, base_path)`
|
173
|
+
|
174
|
+
导出为 TXT 格式。
|
175
|
+
|
176
|
+
**参数:**
|
177
|
+
- `dataset` (Dataset): 要导出的数据集
|
178
|
+
- `output_path` (str): 输出文件路径
|
179
|
+
- `use_relative_paths` (bool, optional): 是否使用相对路径
|
180
|
+
- `base_path` (str, optional): 相对路径的基准目录
|
181
|
+
|
182
|
+
## 🏗️ 架构设计
|
183
|
+
|
184
|
+
```
|
185
|
+
dataset_toolkit/
|
186
|
+
├── models.py # 数据模型定义
|
187
|
+
├── loaders/ # 数据加载器
|
188
|
+
│ ├── local_loader.py # 本地文件系统加载器
|
189
|
+
│ └── remote_loader.py # 远程数据源加载器(待开发)
|
190
|
+
├── processors/ # 数据处理器
|
191
|
+
│ ├── merger.py # 数据集合并
|
192
|
+
│ └── filter.py # 数据过滤(待开发)
|
193
|
+
├── exporters/ # 数据导出器
|
194
|
+
│ ├── coco_exporter.py # COCO格式导出
|
195
|
+
│ └── txt_exporter.py # TXT格式导出
|
196
|
+
└── utils/ # 工具函数
|
197
|
+
└── coords.py # 坐标转换
|
198
|
+
```
|
199
|
+
|
200
|
+
## 🔧 高级用法
|
201
|
+
|
202
|
+
### 自定义数据加载器
|
203
|
+
|
204
|
+
```python
|
205
|
+
from dataset_toolkit.models import Dataset, ImageAnnotation
|
206
|
+
from dataset_toolkit.loaders import BaseLoader  # 注意:0.1.0 版本尚未提供 BaseLoader(规划中的接口)
|
207
|
+
|
208
|
+
class CustomLoader(BaseLoader):
|
209
|
+
def load(self, path, **kwargs):
|
210
|
+
# 实现你的自定义加载逻辑
|
211
|
+
dataset = Dataset(name="custom")
|
212
|
+
# ... 加载数据 ...
|
213
|
+
return dataset
|
214
|
+
```
|
215
|
+
|
216
|
+
### 批量处理
|
217
|
+
|
218
|
+
```python
|
219
|
+
from pathlib import Path
|
220
|
+
from dataset_toolkit import load_yolo_from_local, export_to_coco
|
221
|
+
|
222
|
+
# 批量处理多个数据集
|
223
|
+
dataset_dirs = [
|
224
|
+
"/data/dataset1",
|
225
|
+
"/data/dataset2",
|
226
|
+
"/data/dataset3"
|
227
|
+
]
|
228
|
+
|
229
|
+
categories = {0: 'object'}
|
230
|
+
|
231
|
+
for dataset_dir in dataset_dirs:
|
232
|
+
ds = load_yolo_from_local(dataset_dir, categories)
|
233
|
+
output_name = Path(dataset_dir).name + ".json"
|
234
|
+
export_to_coco(ds, f"output/{output_name}")
|
235
|
+
```
|
236
|
+
|
237
|
+
## 🧪 测试
|
238
|
+
|
239
|
+
运行测试:
|
240
|
+
|
241
|
+
```bash
|
242
|
+
pytest
|
243
|
+
```
|
244
|
+
|
245
|
+
生成测试覆盖率报告:
|
246
|
+
|
247
|
+
```bash
|
248
|
+
pytest --cov=dataset_toolkit --cov-report=html
|
249
|
+
```
|
250
|
+
|
251
|
+
## 📝 开发计划
|
252
|
+
|
253
|
+
- [ ] 支持更多数据格式(Pascal VOC、YOLO v8等)
|
254
|
+
- [ ] 添加数据增强功能
|
255
|
+
- [ ] 支持远程数据源(S3、HTTP等)
|
256
|
+
- [ ] 添加数据统计和可视化功能
|
257
|
+
- [ ] 提供命令行工具
|
258
|
+
- [ ] 支持视频数据集
|
259
|
+
|
260
|
+
## 🤝 贡献
|
261
|
+
|
262
|
+
欢迎提交 Issue 和 Pull Request!
|
263
|
+
|
264
|
+
## 📄 许可证
|
265
|
+
|
266
|
+
MIT License
|
267
|
+
|
268
|
+
## 📧 联系方式
|
269
|
+
|
270
|
+
如有问题,请通过以下方式联系:
|
271
|
+
- Email: your.email@example.com
|
272
|
+
- GitHub Issues: https://github.com/yourusername/dataset-toolkit/issues
|
273
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
dataset_toolkit/__init__.py,sha256=yMm1ajpItXWlKdiqEmY3kRXDI9F0Voreg4hEH0xxM1s,1604
|
2
|
+
dataset_toolkit/models.py,sha256=uVtTbVYdHMECPL_waDhEebLvL_VwqSEm9XFC5QYIB10,767
|
3
|
+
dataset_toolkit/pipeline.py,sha256=iBJD7SemEVFTwzHxRQrjpUIQQcVdPSZnD4sB_y56Md0,5697
|
4
|
+
dataset_toolkit/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
dataset_toolkit/exporters/coco_exporter.py,sha256=l5sfj7rOcvcMC0-4LNOEJ4PeklGQORDflU_um5GGnxA,2120
|
6
|
+
dataset_toolkit/exporters/txt_exporter.py,sha256=9nTWs6M89MdKJhlODtmfzeZqWkliXac9NMWPgVUrE7c,1246
|
7
|
+
dataset_toolkit/loaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
+
dataset_toolkit/loaders/local_loader.py,sha256=Wy_hXY2B-SDxAmJGBYQpqBUe3cjz-k_McYhYf7cLgCk,2501
|
9
|
+
dataset_toolkit/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
dataset_toolkit/processors/merger.py,sha256=h8qQNgSmkPrhoQ3QiWEyIl11CmmjT5K1-8TzNb7_jbk,2834
|
11
|
+
dataset_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
+
dataset_toolkit/utils/coords.py,sha256=GtTQz2gFyFQfXhKfecI8tzqWFjraJY6Xo85-kRXYAYc,614
|
13
|
+
dataset_toolkit-0.1.0.dist-info/licenses/LICENSE,sha256=8_up1FX6vk2DRcusQEZ4pWJGkgkjvEkD14xB1hdLe3c,1067
|
14
|
+
dataset_toolkit-0.1.0.dist-info/METADATA,sha256=yjqPr_Wjioiw5v7AOkqduI5B_Y6oyBbKrTpJGIKVIWw,7225
|
15
|
+
dataset_toolkit-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
16
|
+
dataset_toolkit-0.1.0.dist-info/top_level.txt,sha256=B4D5vMLjUNJBZDdL7Utc0FYIfYoWbzyIGBMVYaeMd3U,16
|
17
|
+
dataset_toolkit-0.1.0.dist-info/RECORD,,
|
@@ -0,0 +1,22 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Your Name
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
22
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
dataset_toolkit
|