intellif-aihub 0.1.14__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of intellif-aihub might be problematic. Click here for more details.

Files changed (63)
  1. {intellif_aihub-0.1.14/src/intellif_aihub.egg-info → intellif_aihub-0.1.15}/PKG-INFO +1 -1
  2. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/pyproject.toml +1 -1
  3. intellif_aihub-0.1.15/src/aihub/__init__.py +1 -0
  4. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/artifact.py +16 -4
  5. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/dataset_management.py +23 -0
  6. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/artifact.py +16 -30
  7. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/dataset_management.py +176 -42
  8. intellif_aihub-0.1.15/src/aihub/utils/di.py +337 -0
  9. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/utils/download.py +3 -15
  10. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/utils/http.py +6 -0
  11. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15/src/intellif_aihub.egg-info}/PKG-INFO +1 -1
  12. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/intellif_aihub.egg-info/SOURCES.txt +2 -0
  13. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_dataset_management.py +15 -19
  14. intellif_aihub-0.1.15/tests/test_di.py +160 -0
  15. intellif_aihub-0.1.14/src/aihub/__init__.py +0 -1
  16. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/LICENSE +0 -0
  17. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/README.md +0 -0
  18. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/setup.cfg +0 -0
  19. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/client.py +0 -0
  20. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/exceptions.py +0 -0
  21. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/__init__.py +0 -0
  22. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/common.py +0 -0
  23. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/data_warehouse.py +0 -0
  24. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/document_center.py +0 -0
  25. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/eval.py +0 -0
  26. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/labelfree.py +0 -0
  27. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/model_center.py +0 -0
  28. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/model_training_platform.py +0 -0
  29. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/quota_schedule_management.py +0 -0
  30. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/tag_resource_management.py +0 -0
  31. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/task_center.py +0 -0
  32. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/user_system.py +0 -0
  33. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/models/workflow_center.py +0 -0
  34. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/__init__.py +0 -0
  35. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/data_warehouse.py +0 -0
  36. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/document_center.py +0 -0
  37. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/eval.py +0 -0
  38. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/labelfree.py +0 -0
  39. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/model_center.py +0 -0
  40. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/model_training_platform.py +0 -0
  41. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/quota_schedule_management.py +0 -0
  42. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/reporter.py +0 -0
  43. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/tag_resource_management.py +0 -0
  44. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/task_center.py +0 -0
  45. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/user_system.py +0 -0
  46. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/services/workflow_center.py +0 -0
  47. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/utils/__init__.py +0 -0
  48. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/aihub/utils/s3.py +0 -0
  49. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/intellif_aihub.egg-info/dependency_links.txt +0 -0
  50. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/intellif_aihub.egg-info/requires.txt +0 -0
  51. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/src/intellif_aihub.egg-info/top_level.txt +0 -0
  52. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_artifact.py +0 -0
  53. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_data_warehouse.py +0 -0
  54. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_document_center.py +0 -0
  55. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_labelfree.py +0 -0
  56. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_model_center.py +0 -0
  57. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_model_training_platform.py +0 -0
  58. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_quota_schedule_management.py +0 -0
  59. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_s3.py +0 -0
  60. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_tag_resource_management.py +0 -0
  61. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_task_center.py +0 -0
  62. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_user_system.py +0 -0
  63. {intellif_aihub-0.1.14 → intellif_aihub-0.1.15}/tests/test_workflow_center.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: intellif-aihub
3
- Version: 0.1.14
3
+ Version: 0.1.15
4
4
  Summary: Intellif AI-hub SDK.
5
5
  Author-email: Platform Team <aihub@example.com>
6
6
  License-Expression: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "intellif-aihub"
3
- version = "0.1.14"
3
+ version = "0.1.15"
4
4
  description = "Intellif AI-hub SDK."
5
5
  readme = {file = "README.md", content-type = "text/markdown"}
6
6
  requires-python = ">=3.9"
@@ -0,0 +1 @@
1
+ __version__ = "0.1.15"
@@ -14,6 +14,7 @@ from pydantic import BaseModel, Field
14
14
 
15
15
  class ArtifactType(str, Enum):
16
16
  """制品类型枚举:"dataset"-数据集类型;"model"-模型类型;"metrics"-指标类型;"log"-日志类型;"checkpoint"-检查点类型;"image"-图像类型;"prediction"-预测结果类型;"other"-其他类型"""
17
+
17
18
  dataset = "dataset" # 数据集类型
18
19
  model = "model" # 模型类型
19
20
  metrics = "metrics" # 指标类型
@@ -26,24 +27,29 @@ class ArtifactType(str, Enum):
26
27
 
27
28
  class CreateArtifactsReq(BaseModel):
28
29
  """创建制品请求"""
30
+
29
31
  entity_id: str = Field(alias="entity_id", description="实体ID,通常是运行ID,用于关联制品与特定运行")
30
- entity_type: ArtifactType = Field(default=ArtifactType.other, alias="entity_type",
31
- description="制品类型,指定制品的类型,默认为other")
32
+ entity_type: ArtifactType = Field(
33
+ default=ArtifactType.other, alias="entity_type", description="制品类型,指定制品的类型,默认为other"
34
+ )
32
35
  src_path: str = Field(alias="src_path", description="源路径,制品在系统中的路径标识")
33
- is_dir: bool = Field(default=False, alias="is_dir",
34
- description="是否为目录,True表示制品是一个目录,False表示是单个文件")
36
+ is_dir: bool = Field(
37
+ default=False, alias="is_dir", description="是否为目录,True表示制品是一个目录,False表示是单个文件"
38
+ )
35
39
 
36
40
  model_config = {"use_enum_values": True}
37
41
 
38
42
 
39
43
  class CreateArtifactsResponseData(BaseModel):
40
44
  """创建制品响应数据"""
45
+
41
46
  id: int = Field(description="制品ID")
42
47
  s3_path: str = Field(alias="s3_path", description="S3存储路径")
43
48
 
44
49
 
45
50
  class CreateArtifactsResponseModel(BaseModel):
46
51
  """创建制品响应模型"""
52
+
47
53
  code: int = Field(description="响应码,0表示成功")
48
54
  msg: str = Field(default="", description="响应消息")
49
55
  data: Optional[CreateArtifactsResponseData] = Field(default=None, description="响应数据")
@@ -51,6 +57,7 @@ class CreateArtifactsResponseModel(BaseModel):
51
57
 
52
58
  class CreateEvalReq(BaseModel):
53
59
  """创建评估请求"""
60
+
54
61
  dataset_id: int = Field(alias="dataset_id", description="数据集ID")
55
62
  dataset_version_id: int = Field(alias="dataset_version_id", description="数据集版本ID")
56
63
  prediction_artifact_path: str = Field(alias="prediction_artifact_path", description="预测结果制品路径")
@@ -62,6 +69,7 @@ class CreateEvalReq(BaseModel):
62
69
 
63
70
  class ArtifactResp(BaseModel):
64
71
  """制品响应模型,表示一个制品的详细信息"""
72
+
65
73
  id: int = Field(description="制品ID")
66
74
  entity_type: str = Field(alias="entity_type", description="实体类型")
67
75
  entity_id: str = Field(alias="entity_id", description="实体ID")
@@ -72,6 +80,7 @@ class ArtifactResp(BaseModel):
72
80
 
73
81
  class ArtifactRespData(BaseModel):
74
82
  """制品分页数据"""
83
+
75
84
  total: int = Field(description="总记录数")
76
85
  page_size: int = Field(alias="page_size", description="每页大小")
77
86
  page_num: int = Field(alias="page_num", description="页码")
@@ -80,6 +89,7 @@ class ArtifactRespData(BaseModel):
80
89
 
81
90
  class ArtifactRespModel(BaseModel):
82
91
  """获取制品响应模型"""
92
+
83
93
  code: int = Field(description="响应码,0表示成功")
84
94
  msg: str = Field(default="", description="响应消息")
85
95
  data: ArtifactRespData = Field(description="响应数据")
@@ -91,8 +101,10 @@ InfinityPageSize = 10000 * 100
91
101
 
92
102
  class StsResp(BaseModel):
93
103
  """STS 临时凭证"""
104
+
94
105
  access_key_id: Optional[str] = Field(default=None, alias="access_key_id", description="访问密钥ID")
95
106
  secret_access_key: Optional[str] = Field(default=None, alias="secret_access_key", description="秘密访问密钥")
96
107
  session_token: Optional[str] = Field(default=None, alias="session_token", description="会话令牌")
97
108
  expiration: Optional[int] = Field(default=None, alias="expiration", description="过期时间")
98
109
  endpoint: Optional[str] = Field(default=None, alias="endpoint", description="端点URL")
110
+ bucket: Optional[str] = Field(default=None, alias="bucket", description="存储桶名称")
@@ -8,6 +8,7 @@ from pydantic import BaseModel, Field
8
8
 
9
9
  class DatasetVersionStatus(IntEnum):
10
10
  """数据集版本状态:1-等待中;2-运行中;3-成功;4-失败;5-加载meta;6-构建index"""
11
+
11
12
  Waiting = 1 # 等待中
12
13
  Running = 2 # 运行中
13
14
  Success = 3 # 成功
@@ -18,6 +19,7 @@ class DatasetVersionStatus(IntEnum):
18
19
 
19
20
  class UploadType(IntEnum):
20
21
  """上传类型:1-本地上传;3-服务器路径上传;4-Labelfree;5-数据接入"""
22
+
21
23
  LOCAL = 1 # 本地上传
22
24
  SERVER_PATH = 3 # 服务器路径上传
23
25
  LABELFREE = 4 # Labelfree
@@ -26,6 +28,7 @@ class UploadType(IntEnum):
26
28
 
27
29
  class CreateDatasetRequest(BaseModel):
28
30
  """创建数据集请求"""
31
+
29
32
  name: str = Field(description="数据集名称")
30
33
  description: str = Field(description="数据集描述")
31
34
  tags: List[int] = Field(description="标签ID列表,通过标签管理系统查询")
@@ -37,11 +40,13 @@ class CreateDatasetRequest(BaseModel):
37
40
 
38
41
  class CreateDatasetResponse(BaseModel):
39
42
  """创建数据集返回"""
43
+
40
44
  id: int = Field(alias="id", description="数据集ID")
41
45
 
42
46
 
43
47
  class DatasetVersionBase(BaseModel):
44
48
  """数据集版本概要"""
49
+
45
50
  id: int = Field(description="版本ID")
46
51
  version: int = Field(description="版本号")
47
52
  status: DatasetVersionStatus = Field(description="版本状态")
@@ -53,6 +58,7 @@ class DatasetVersionBase(BaseModel):
53
58
 
54
59
  class DatasetDetail(BaseModel):
55
60
  """数据集详情"""
61
+
56
62
  id: int = Field(description="数据集 ID")
57
63
  name: str = Field(description="名称")
58
64
  description: str = Field(description="描述")
@@ -69,6 +75,7 @@ class DatasetDetail(BaseModel):
69
75
 
70
76
  class ExtInfo(BaseModel):
71
77
  """扩展信息"""
78
+
72
79
  rec_file_path: Optional[str] = Field(None, alias="rec_file_path", description="rec文件路径")
73
80
  idx_file_path: Optional[str] = Field(None, alias="idx_file_path", description="idx文件路径")
74
81
  json_file_path: Optional[str] = Field(None, alias="json_file_path", description="json文件路径")
@@ -77,6 +84,7 @@ class ExtInfo(BaseModel):
77
84
 
78
85
  class CreateDatasetVersionRequest(BaseModel):
79
86
  """创建版本请求"""
87
+
80
88
  upload_path: str = Field(alias="upload_path", description="上传路径")
81
89
  description: Optional[str] = Field(None, description="版本描述")
82
90
  dataset_id: int = Field(alias="dataset_id", description="数据集ID")
@@ -91,11 +99,13 @@ class CreateDatasetVersionRequest(BaseModel):
91
99
 
92
100
  class CreateDatasetVersionResponse(BaseModel):
93
101
  """创建版本返回"""
102
+
94
103
  id: int = Field(alias="id", description="版本ID")
95
104
 
96
105
 
97
106
  class UploadDatasetVersionRequest(BaseModel):
98
107
  """上传数据集版本请求"""
108
+
99
109
  upload_path: str = Field(alias="upload_path", description="上传目录")
100
110
  upload_type: UploadType = Field(alias="upload_type", description="上传类型")
101
111
  dataset_id: int = Field(alias="dataset_id", description="数据集ID")
@@ -107,11 +117,13 @@ class UploadDatasetVersionRequest(BaseModel):
107
117
 
108
118
  class UploadDatasetVersionResponse(BaseModel):
109
119
  """上传数据集版本返回"""
120
+
110
121
  id: int = Field(alias="id", description="版本ID")
111
122
 
112
123
 
113
124
  class DatasetVersionDetail(BaseModel):
114
125
  """数据集版本详情"""
126
+
115
127
  id: int = Field(description="版本ID")
116
128
  version: int = Field(description="版本号")
117
129
  dataset_id: int = Field(alias="dataset_id", description="数据集ID")
@@ -133,6 +145,7 @@ class DatasetVersionDetail(BaseModel):
133
145
 
134
146
  class FileUploadData(BaseModel):
135
147
  """文件上传数据"""
148
+
136
149
  path: str = Field(description="路径")
137
150
  url: str = Field(description="URL")
138
151
 
@@ -203,3 +216,13 @@ class ListDatasetVersionResp(BaseModel):
203
216
  page_size: int = Field(alias="page_size", description="每页大小")
204
217
  page_num: int = Field(alias="page_num", description="当前页码")
205
218
  data: List[ListDatasetVersionItem] = Field(description="数据集版本列表")
219
+
220
+
221
+ class CreateDatasetVersionByDataIngestReqV2(BaseModel):
222
+ """通过数据集成创建数据集版本请求"""
223
+
224
+ description: Optional[str] = Field(None, description="描述")
225
+ dataset_id: int = Field(..., description="数据集ID")
226
+ s3_object_sheet: str = Field(..., description="S3对象表")
227
+ object_cnt: Optional[int] = Field(None, description="对象数量")
228
+ data_size: Optional[int] = Field(None, description="数据大小")
@@ -98,9 +98,7 @@ class ArtifactService:
98
98
  """
99
99
  return self._artifact.get_sts()
100
100
 
101
- def get_by_run_id(
102
- self, run_id: str, artifact_path: Optional[str] = None
103
- ) -> List[ArtifactResp]:
101
+ def get_by_run_id(self, run_id: str, artifact_path: Optional[str] = None) -> List[ArtifactResp]:
104
102
  """根据运行ID获取制品列表
105
103
 
106
104
  Args:
@@ -116,11 +114,11 @@ class ArtifactService:
116
114
  return self._artifact.get_by_run_id(run_id, artifact_path)
117
115
 
118
116
  def create_artifact(
119
- self,
120
- local_path: str,
121
- artifact_path: Optional[str] = None,
122
- run_id: Optional[str] = None,
123
- artifact_type: ArtifactType = ArtifactType.other,
117
+ self,
118
+ local_path: str,
119
+ artifact_path: Optional[str] = None,
120
+ run_id: Optional[str] = None,
121
+ artifact_type: ArtifactType = ArtifactType.other,
124
122
  ) -> None:
125
123
  """创建单个文件制品并上传
126
124
 
@@ -171,11 +169,11 @@ class ArtifactService:
171
169
  return
172
170
 
173
171
  def create_artifacts(
174
- self,
175
- local_dir: str,
176
- artifact_path: Optional[str] = None,
177
- run_id: Optional[str] = None,
178
- artifact_type: ArtifactType = ArtifactType.other,
172
+ self,
173
+ local_dir: str,
174
+ artifact_path: Optional[str] = None,
175
+ run_id: Optional[str] = None,
176
+ artifact_type: ArtifactType = ArtifactType.other,
179
177
  ) -> None:
180
178
  """创建目录制品并上传
181
179
 
@@ -223,9 +221,7 @@ class ArtifactService:
223
221
  logger.info(f"log artifact done: {artifact_path}")
224
222
  return
225
223
 
226
- def download_artifacts(
227
- self, run_id: str, artifact_path: Optional[str], local_dir: str
228
- ) -> None:
224
+ def download_artifacts(self, run_id: str, artifact_path: Optional[str], local_dir: str) -> None:
229
225
  """下载制品
230
226
 
231
227
  Args:
@@ -252,9 +248,7 @@ class ArtifactService:
252
248
  if artifact_item.is_dir:
253
249
  download_dir_from_s3(self.s3_client, bucket, object_name, local_dir)
254
250
  else:
255
- self.s3_client.fget_object(
256
- bucket, object_name, str(Path(local_dir) / artifact_item.src_path)
257
- )
251
+ self.s3_client.fget_object(bucket, object_name, str(Path(local_dir) / artifact_item.src_path))
258
252
 
259
253
  logger.info(f"download artifact done: {artifact_path}")
260
254
  return
@@ -311,9 +305,7 @@ class _Artifact:
311
305
  raise APIError(f"backend code {wrapper.code}: {wrapper.msg}")
312
306
  return
313
307
 
314
- def get_by_run_id(
315
- self, run_id: str, artifact_path: Optional[str]
316
- ) -> List[ArtifactResp]:
308
+ def get_by_run_id(self, run_id: str, artifact_path: Optional[str]) -> List[ArtifactResp]:
317
309
  """根据运行ID获取制品列表
318
310
 
319
311
  Args:
@@ -326,18 +318,12 @@ class _Artifact:
326
318
  Raises:
327
319
  APIError: 当API调用失败时抛出
328
320
  """
329
- resp = self._http.get(
330
- f"{_Base}/artifacts?entity_id={run_id}&page_num=1&page_size={InfinityPageSize}"
331
- )
321
+ resp = self._http.get(f"{_Base}/artifacts?entity_id={run_id}&page_num=1&page_size={InfinityPageSize}")
332
322
  wrapper = APIWrapper[ArtifactRespData].model_validate(resp.json())
333
323
  if wrapper.code != 0:
334
324
  raise APIError(f"backend code {wrapper.code}: {wrapper.msg}")
335
325
  if artifact_path:
336
- return [
337
- artifact
338
- for artifact in wrapper.data.data
339
- if artifact.src_path == artifact_path
340
- ]
326
+ return [artifact for artifact in wrapper.data.data if artifact.src_path == artifact_path]
341
327
  else:
342
328
  return wrapper.data.data
343
329
 
@@ -17,7 +17,6 @@ from __future__ import annotations
17
17
  import mimetypes
18
18
  import os
19
19
  import pathlib
20
- import tempfile
21
20
  import time
22
21
  import uuid
23
22
 
@@ -25,6 +24,7 @@ import httpx
25
24
  from loguru import logger
26
25
 
27
26
  from ..exceptions import APIError
27
+ from ..models.artifact import StsResp
28
28
  from ..models.common import APIWrapper
29
29
  from ..models.dataset_management import (
30
30
  CreateDatasetRequest,
@@ -40,9 +40,12 @@ from ..models.dataset_management import (
40
40
  ListDatasetResp,
41
41
  ListDatasetVersionReq,
42
42
  ListDatasetVersionResp,
43
+ CreateDatasetVersionByDataIngestReqV2,
44
+ UploadType,
43
45
  )
44
46
  from ..models.dataset_management import DatasetVersionStatus
45
- from ..utils.download import dataset_download, zip_dir
47
+ from ..utils.di import SimpleS3Client, DataUploader
48
+ from ..utils.download import dataset_download
46
49
 
47
50
  _BASE = "/dataset-mng/api/v2"
48
51
 
@@ -138,20 +141,29 @@ class DatasetManagementService:
138
141
  def dataset(self) -> _Dataset:
139
142
  return self._dataset
140
143
 
144
+ def _get_sts(self) -> StsResp:
145
+ return self.dataset_version.get_sts()
146
+
141
147
  @property
142
148
  def dataset_version(self) -> _DatasetVersion:
143
149
  return self._dataset_version
144
150
 
151
+ def upload_by_data_ingest(
152
+ self,
153
+ req: CreateDatasetVersionByDataIngestReqV2,
154
+ ) -> CreateDatasetVersionResponse:
155
+ return self.dataset_version.upload_by_data_ingest(req)
156
+
145
157
  def create_dataset_and_version(
146
- self,
147
- *,
148
- dataset_name: str,
149
- dataset_description: str = "",
150
- is_local_upload: bool,
151
- local_file_path: str | None = None,
152
- server_file_path: str | None = None,
153
- version_description: str = "",
154
- timeout: int = 1_800,
158
+ self,
159
+ *,
160
+ dataset_name: str,
161
+ dataset_description: str = "",
162
+ is_local_upload: bool,
163
+ local_file_path: str | None = None,
164
+ server_file_path: str | None = None,
165
+ version_description: str = "",
166
+ timeout: int = 1_800,
155
167
  ) -> tuple[int, int, str]:
156
168
  """创建数据集及其版本,并等待版本状态变为 *Success*。
157
169
 
@@ -169,17 +181,51 @@ class DatasetManagementService:
169
181
 
170
182
  Returns:
171
183
  tuple[int, int, str]: 一个三元组,包含:[数据集 ID,数据集版本 ID, 数据集版本标签(格式为 <dataset_name>/V<version_number>)]
184
+
185
+ Raises:
186
+ ValueError: 当参数不满足要求时
187
+ APIError: 当后端返回错误时
188
+ TimeoutError: 当等待超时时
172
189
  """
190
+ # 参数校验
191
+ self._validate_create_params(is_local_upload, local_file_path, server_file_path)
192
+
193
+ # 创建数据集
194
+ dataset_id = self._create_dataset(dataset_name, dataset_description)
195
+ logger.info(f"创建数据集成功,名称为 {dataset_name} ,开始准备创建版本、上传数据")
196
+
197
+ # 创建数据集版本
198
+ version_id = self._create_dataset_version(
199
+ dataset_id=dataset_id,
200
+ is_local_upload=is_local_upload,
201
+ local_file_path=local_file_path,
202
+ server_file_path=server_file_path,
203
+ version_description=version_description,
204
+ )
205
+
206
+ # 获取版本标签
207
+ version_tag = self._get_version_tag(dataset_id, version_id)
208
+ logger.info(f"数据集版本创建成功,名称为 {version_tag},开始轮询状态…")
209
+
210
+ # 轮询等待版本状态变为成功
211
+ self._wait_for_version_success(version_id, timeout)
212
+
213
+ return dataset_id, version_id, version_tag
214
+
215
+ def _validate_create_params(
216
+ self, is_local_upload: bool, local_file_path: str | None, server_file_path: str | None
217
+ ) -> None:
218
+ """验证创建数据集和版本所需的参数"""
173
219
  if is_local_upload:
174
220
  if not local_file_path:
175
221
  raise ValueError("is_local_upload=True 时必须提供 local_file_path")
176
- upload_type = 1
177
222
  else:
178
223
  if not server_file_path:
179
224
  raise ValueError("is_local_upload=False 时必须提供 server_file_path")
180
- upload_type = 3
181
225
 
182
- dataset_id = self._dataset.create(
226
+ def _create_dataset(self, dataset_name: str, dataset_description: str) -> int:
227
+ """创建数据集"""
228
+ return self._dataset.create(
183
229
  CreateDatasetRequest(
184
230
  name=dataset_name,
185
231
  description=dataset_description,
@@ -190,39 +236,96 @@ class DatasetManagementService:
190
236
  access_user_ids=None,
191
237
  )
192
238
  )
193
- logger.info(
194
- f"创建数据集成功,名称为 {dataset_name} ,开始准备创建版本、上传数据"
195
- )
196
239
 
240
+ def _create_dataset_version(
241
+ self,
242
+ dataset_id: int,
243
+ is_local_upload: bool,
244
+ local_file_path: str | None,
245
+ server_file_path: str | None,
246
+ version_description: str,
247
+ ) -> int:
248
+ """根据上传类型创建数据集版本"""
197
249
  if is_local_upload:
198
- # 上传文件,检查是文件夹还是zip
199
- local_file_path = pathlib.Path(local_file_path)
200
- if local_file_path.is_dir():
201
- # 把文件夹打包为一个 zip
202
- temp_zip_path = (
203
- pathlib.Path(tempfile.mkdtemp()) / f" {uuid.uuid4().hex}.zip"
204
- )
205
- zip_dir(local_file_path, temp_zip_path)
206
- upload_data = self._upload.upload_file(temp_zip_path)
207
- os.remove(temp_zip_path)
208
- else:
209
- upload_data = self._upload.upload_file(local_file_path)
210
-
211
- upload_path = upload_data.path
250
+ return self._create_local_dataset_version(dataset_id, local_file_path, version_description)
212
251
  else:
213
- upload_path = server_file_path
214
- logger.info(f"文件上传成功:{local_file_path}")
252
+ return self._create_server_dataset_version(dataset_id, server_file_path, version_description)
253
+
254
+ def _create_local_dataset_version(
255
+ self, dataset_id: int, local_file_path: str | None, version_description: str
256
+ ) -> int:
257
+ """创建本地文件数据集版本"""
258
+ if pathlib.Path(local_file_path).is_dir():
259
+ return self._create_local_dir_dataset_version(dataset_id, local_file_path)
260
+ elif pathlib.Path(local_file_path).is_file():
261
+ return self._create_local_file_dataset_version(dataset_id, local_file_path, version_description)
262
+ else:
263
+ raise ValueError(f"本地路径既不是文件也不是目录: {local_file_path}")
264
+
265
+ def _create_local_dir_dataset_version(self, dataset_id: int, local_file_path: str) -> int:
266
+ """处理本地目录上传"""
267
+ sts = self._get_sts()
268
+ s3_client = SimpleS3Client(
269
+ sts.endpoint, sts.access_key_id, sts.secret_access_key, session_token=sts.session_token
270
+ )
271
+ uid = uuid.uuid4().hex
272
+ s3_target = f"s3://{sts.bucket}/dataset_workspace/{dataset_id}/{uid}"
273
+ s3_csv_path = f"s3://{sts.bucket}/dataset_workspace/{dataset_id}/{uid}.csv"
274
+ s3_status_path = f"s3://{sts.bucket}/dataset_workspace/{dataset_id}/{uid}.json"
275
+
276
+ # 创建上传器并执行
277
+ uploader = DataUploader(
278
+ task_id=dataset_id,
279
+ local_path=str(local_file_path),
280
+ s3_target=s3_target,
281
+ csv_path=s3_csv_path,
282
+ status_path=s3_status_path,
283
+ num_workers=40,
284
+ )
215
285
 
216
- version_id = self._dataset_version.upload(
286
+ upload_stats = uploader.run(s3_client)
287
+ req = CreateDatasetVersionByDataIngestReqV2(
288
+ description=f"sdk 上传",
289
+ dataset_id=dataset_id,
290
+ s3_object_sheet=s3_csv_path,
291
+ object_cnt=upload_stats.uploaded_count,
292
+ data_size=upload_stats.uploaded_size,
293
+ )
294
+ return self.upload_by_data_ingest(req).id
295
+
296
+ def _create_local_file_dataset_version(
297
+ self, dataset_id: int, local_file_path: str, version_description: str
298
+ ) -> int:
299
+ """处理本地文件上传"""
300
+ upload_data = self._upload.upload_file(local_file_path)
301
+ upload_path = upload_data.path
302
+ logger.info(f"文件上传成功:{local_file_path}")
303
+ return self._dataset_version.upload(
217
304
  UploadDatasetVersionRequest(
218
305
  upload_path=upload_path,
219
- upload_type=upload_type,
306
+ upload_type=UploadType.LOCAL, # 本地上传类型
307
+ dataset_id=dataset_id,
308
+ description=version_description,
309
+ parent_version_id=0,
310
+ )
311
+ )
312
+
313
+ def _create_server_dataset_version(
314
+ self, dataset_id: int, server_file_path: str | None, version_description: str
315
+ ) -> int:
316
+ """创建服务器文件数据集版本"""
317
+ return self._dataset_version.upload(
318
+ UploadDatasetVersionRequest(
319
+ upload_path=server_file_path,
320
+ upload_type=UploadType.SERVER_PATH, # 服务器文件上传类型
220
321
  dataset_id=dataset_id,
221
322
  description=version_description,
222
323
  parent_version_id=0,
223
324
  )
224
325
  )
225
326
 
327
+ def _get_version_tag(self, dataset_id: int, version_id: int) -> str:
328
+ """获取版本标签"""
226
329
  detail = self._dataset.get(dataset_id)
227
330
  ver_num = next(
228
331
  (v.version for v in detail.versions if v.id == version_id),
@@ -231,9 +334,10 @@ class DatasetManagementService:
231
334
  if ver_num is None:
232
335
  ver_num = 1
233
336
 
234
- version_tag = f"{detail.name}/V{ver_num}"
235
- logger.info(f"数据集版本创建成功,名称为 {version_tag},开始轮询状态…")
337
+ return f"{detail.name}/V{ver_num}"
236
338
 
339
+ def _wait_for_version_success(self, version_id: int, timeout: int) -> None:
340
+ """轮询等待版本状态变为成功"""
237
341
  start_ts = time.time()
238
342
  poll_interval = 10
239
343
 
@@ -255,8 +359,6 @@ class DatasetManagementService:
255
359
  logger.debug(f"已等待 {elapsed:.0f}s,继续轮询…")
256
360
  time.sleep(poll_interval)
257
361
 
258
- return dataset_id, version_id, version_tag
259
-
260
362
  def run_download(self, dataset_version_name: str, local_dir: str, worker: int = 4) -> None:
261
363
  """根据数据集版本名称下载对应的数据集文件。
262
364
 
@@ -400,9 +502,7 @@ class _DatasetVersion:
400
502
  return wrapper.data
401
503
 
402
504
  def get_by_name(self, version_name: str) -> DatasetVersionDetail:
403
- resp = self._http.get(
404
- f"{_BASE}/dataset-versions-detail", params={"name": version_name}
405
- )
505
+ resp = self._http.get(f"{_BASE}/dataset-versions-detail", params={"name": version_name})
406
506
  wrapper = APIWrapper[DatasetVersionDetail].model_validate(resp.json())
407
507
  if wrapper.code != 0:
408
508
  raise APIError(f"backend code {wrapper.code}: {wrapper.msg}")
@@ -417,6 +517,40 @@ class _DatasetVersion:
417
517
  raise APIError(f"backend code {wrapper.code}: {wrapper.msg}")
418
518
  return wrapper.data
419
519
 
520
+ def get_sts(self) -> StsResp:
521
+ """获取STS临时凭证
522
+
523
+ 获取用于访问S3存储的临时凭证。
524
+
525
+ Returns:
526
+ StsResp: STS临时凭证信息
527
+
528
+ Raises:
529
+ APIError: 当API调用失败时抛出
530
+ """
531
+ resp = self._http.get(f"{_BASE}/dataset-versions/get-sts")
532
+ logger.info(f"get sts: {resp.text}")
533
+ wrapper = APIWrapper[StsResp].model_validate(resp.json())
534
+ if wrapper.code != 0:
535
+ raise APIError(f"backend code {wrapper.code}: {wrapper.msg}")
536
+ return wrapper.data
537
+
538
+ def upload_by_data_ingest(self, req: CreateDatasetVersionByDataIngestReqV2) -> CreateDatasetVersionResponse:
539
+ """上传数据集版本(数据集导入)
540
+ Args:
541
+ req
542
+
543
+ """
544
+ resp = self._http.post(
545
+ f"{_BASE}/dataset-versions/data-ingest",
546
+ json=req.model_dump(),
547
+ )
548
+ logger.debug(f"upload_by_data_ingest: {resp.text}")
549
+ wrapper = APIWrapper[CreateDatasetVersionResponse].model_validate(resp.json())
550
+ if wrapper.code != 0:
551
+ raise APIError(f"backend code {wrapper.code}: {wrapper.msg}")
552
+ return wrapper.data
553
+
420
554
 
421
555
  class _Upload:
422
556
  def __init__(self, http: httpx.Client):