rapidata 2.35.2__py3-none-any.whl → 2.35.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rapidata might be problematic. Click here for more details.
- rapidata/__init__.py +2 -1
- rapidata/rapidata_client/__init__.py +5 -13
- rapidata/rapidata_client/benchmark/participant/_participant.py +45 -26
- rapidata/rapidata_client/benchmark/rapidata_benchmark_manager.py +73 -30
- rapidata/rapidata_client/config/__init__.py +1 -0
- rapidata/rapidata_client/config/config.py +33 -0
- rapidata/rapidata_client/datapoints/assets/_sessions.py +13 -8
- rapidata/rapidata_client/order/_rapidata_dataset.py +23 -33
- {rapidata-2.35.2.dist-info → rapidata-2.35.3.dist-info}/METADATA +1 -1
- {rapidata-2.35.2.dist-info → rapidata-2.35.3.dist-info}/RECORD +12 -10
- {rapidata-2.35.2.dist-info → rapidata-2.35.3.dist-info}/LICENSE +0 -0
- {rapidata-2.35.2.dist-info → rapidata-2.35.3.dist-info}/WHEEL +0 -0
rapidata/rapidata_client/__init__.py
CHANGED
|
@@ -16,14 +16,9 @@ from .datapoints.metadata import (
|
|
|
16
16
|
PromptMetadata,
|
|
17
17
|
SelectWordsMetadata,
|
|
18
18
|
)
|
|
19
|
-
from .datapoints.assets import
|
|
20
|
-
MediaAsset,
|
|
21
|
-
TextAsset,
|
|
22
|
-
MultiAsset,
|
|
23
|
-
RapidataDataTypes
|
|
24
|
-
)
|
|
19
|
+
from .datapoints.assets import MediaAsset, TextAsset, MultiAsset, RapidataDataTypes
|
|
25
20
|
from .settings import (
|
|
26
|
-
RapidataSettings,
|
|
21
|
+
RapidataSettings,
|
|
27
22
|
TranslationBehaviourOptions,
|
|
28
23
|
AlertOnFastResponse,
|
|
29
24
|
TranslationBehaviour,
|
|
@@ -32,7 +27,7 @@ from .settings import (
|
|
|
32
27
|
PlayVideoUntilTheEnd,
|
|
33
28
|
CustomSetting,
|
|
34
29
|
AllowNeitherBoth,
|
|
35
|
-
|
|
30
|
+
)
|
|
36
31
|
from .country_codes import CountryCodes
|
|
37
32
|
from .filter import (
|
|
38
33
|
CountryFilter,
|
|
@@ -49,11 +44,8 @@ from .filter import (
|
|
|
49
44
|
ResponseCountFilter,
|
|
50
45
|
)
|
|
51
46
|
|
|
52
|
-
from .logging import
|
|
53
|
-
configure_logger,
|
|
54
|
-
logger,
|
|
55
|
-
RapidataOutputManager
|
|
56
|
-
)
|
|
47
|
+
from .logging import configure_logger, logger, RapidataOutputManager
|
|
57
48
|
|
|
58
49
|
from .validation import Box
|
|
59
50
|
from .exceptions import FailedUploadException
|
|
51
|
+
from .config import rapidata_config
|
|
rapidata/rapidata_client/benchmark/participant/_participant.py
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
2
|
+
import time
|
|
2
3
|
from tqdm import tqdm
|
|
3
4
|
|
|
4
5
|
from rapidata.rapidata_client.datapoints.assets import MediaAsset
|
|
@@ -6,6 +7,10 @@ from rapidata.rapidata_client.logging import logger
|
|
|
6
7
|
from rapidata.rapidata_client.logging.output_manager import RapidataOutputManager
|
|
7
8
|
from rapidata.api_client.models.create_sample_model import CreateSampleModel
|
|
8
9
|
from rapidata.service.openapi_service import OpenAPIService
|
|
10
|
+
from rapidata.rapidata_client.config.config import rapidata_config
|
|
11
|
+
from rapidata.rapidata_client.api.rapidata_exception import (
|
|
12
|
+
suppress_rapidata_error_logging,
|
|
13
|
+
)
|
|
9
14
|
|
|
10
15
|
|
|
11
16
|
class BenchmarkParticipant:
|
|
@@ -21,11 +26,11 @@ class BenchmarkParticipant:
|
|
|
21
26
|
) -> tuple[MediaAsset | None, MediaAsset | None]:
|
|
22
27
|
"""
|
|
23
28
|
Process single sample upload with retry logic and error tracking.
|
|
24
|
-
|
|
29
|
+
|
|
25
30
|
Args:
|
|
26
31
|
asset: MediaAsset to upload
|
|
27
32
|
identifier: Identifier for the sample
|
|
28
|
-
|
|
33
|
+
|
|
29
34
|
Returns:
|
|
30
35
|
tuple[MediaAsset | None, MediaAsset | None]: (successful_asset, failed_asset)
|
|
31
36
|
"""
|
|
@@ -37,20 +42,30 @@ class BenchmarkParticipant:
|
|
|
37
42
|
urls = [asset.path]
|
|
38
43
|
|
|
39
44
|
last_exception = None
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
45
|
+
for attempt in range(rapidata_config.upload_max_retries):
|
|
46
|
+
try:
|
|
47
|
+
with suppress_rapidata_error_logging():
|
|
48
|
+
self.__openapi_service.participant_api.participant_participant_id_sample_post(
|
|
49
|
+
participant_id=self.id,
|
|
50
|
+
model=CreateSampleModel(identifier=identifier),
|
|
51
|
+
files=files,
|
|
52
|
+
urls=urls,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
return asset, None
|
|
56
|
+
|
|
57
|
+
except Exception as e:
|
|
58
|
+
last_exception = e
|
|
59
|
+
if attempt < rapidata_config.upload_max_retries - 1:
|
|
60
|
+
# Exponential backoff: wait 1s, then 2s, then 4s
|
|
61
|
+
retry_delay = 2**attempt
|
|
62
|
+
time.sleep(retry_delay)
|
|
63
|
+
logger.debug("Error: %s", str(last_exception))
|
|
64
|
+
logger.debug(
|
|
65
|
+
"Retrying %s of %s...",
|
|
66
|
+
attempt + 1,
|
|
67
|
+
rapidata_config.upload_max_retries,
|
|
68
|
+
)
|
|
54
69
|
|
|
55
70
|
logger.error(f"Upload failed for {identifier}. Error: {str(last_exception)}")
|
|
56
71
|
return None, asset
|
|
@@ -59,24 +74,24 @@ class BenchmarkParticipant:
|
|
|
59
74
|
self,
|
|
60
75
|
assets: list[MediaAsset],
|
|
61
76
|
identifiers: list[str],
|
|
62
|
-
max_workers: int = 10,
|
|
63
77
|
) -> tuple[list[MediaAsset], list[MediaAsset]]:
|
|
64
78
|
"""
|
|
65
79
|
Upload samples concurrently with proper error handling and progress tracking.
|
|
66
|
-
|
|
80
|
+
|
|
67
81
|
Args:
|
|
68
82
|
assets: List of MediaAsset objects to upload
|
|
69
83
|
identifiers: List of identifiers matching the assets
|
|
70
|
-
|
|
71
|
-
|
|
84
|
+
|
|
72
85
|
Returns:
|
|
73
86
|
tuple[list[str], list[str]]: Lists of successful and failed identifiers
|
|
74
87
|
"""
|
|
75
88
|
successful_uploads: list[MediaAsset] = []
|
|
76
89
|
failed_uploads: list[MediaAsset] = []
|
|
77
90
|
total_uploads = len(assets)
|
|
78
|
-
|
|
79
|
-
with ThreadPoolExecutor(
|
|
91
|
+
|
|
92
|
+
with ThreadPoolExecutor(
|
|
93
|
+
max_workers=rapidata_config.max_upload_workers
|
|
94
|
+
) as executor:
|
|
80
95
|
futures = [
|
|
81
96
|
executor.submit(
|
|
82
97
|
self._process_single_sample_upload,
|
|
@@ -85,8 +100,12 @@ class BenchmarkParticipant:
|
|
|
85
100
|
)
|
|
86
101
|
for asset, identifier in zip(assets, identifiers)
|
|
87
102
|
]
|
|
88
|
-
|
|
89
|
-
with tqdm(
|
|
103
|
+
|
|
104
|
+
with tqdm(
|
|
105
|
+
total=total_uploads,
|
|
106
|
+
desc="Uploading media",
|
|
107
|
+
disable=RapidataOutputManager.silent_mode,
|
|
108
|
+
) as pbar:
|
|
90
109
|
for future in as_completed(futures):
|
|
91
110
|
try:
|
|
92
111
|
successful_id, failed_id = future.result()
|
|
@@ -96,7 +115,7 @@ class BenchmarkParticipant:
|
|
|
96
115
|
failed_uploads.append(failed_id)
|
|
97
116
|
except Exception as e:
|
|
98
117
|
logger.error(f"Future execution failed: {str(e)}")
|
|
99
|
-
|
|
118
|
+
|
|
100
119
|
pbar.update(1)
|
|
101
|
-
|
|
120
|
+
|
|
102
121
|
return successful_uploads, failed_uploads
|
|
rapidata/rapidata_client/benchmark/rapidata_benchmark_manager.py
@@ -8,6 +8,7 @@ from rapidata.api_client.models.root_filter import RootFilter
|
|
|
8
8
|
from rapidata.api_client.models.filter import Filter
|
|
9
9
|
from rapidata.api_client.models.sort_criterion import SortCriterion
|
|
10
10
|
|
|
11
|
+
|
|
11
12
|
class RapidataBenchmarkManager:
|
|
12
13
|
"""
|
|
13
14
|
A manager for benchmarks.
|
|
@@ -19,16 +20,18 @@ class RapidataBenchmarkManager:
|
|
|
19
20
|
Args:
|
|
20
21
|
openapi_service: The OpenAPIService instance for API interaction.
|
|
21
22
|
"""
|
|
23
|
+
|
|
22
24
|
def __init__(self, openapi_service: OpenAPIService):
|
|
23
25
|
self.__openapi_service = openapi_service
|
|
24
26
|
|
|
25
|
-
def create_new_benchmark(
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
27
|
+
def create_new_benchmark(
|
|
28
|
+
self,
|
|
29
|
+
name: str,
|
|
30
|
+
identifiers: list[str],
|
|
31
|
+
prompts: Optional[list[str | None]] = None,
|
|
32
|
+
prompt_assets: Optional[list[str | None]] = None,
|
|
33
|
+
tags: Optional[list[list[str] | None]] = None,
|
|
34
|
+
) -> RapidataBenchmark:
|
|
32
35
|
"""
|
|
33
36
|
Creates a new benchmark with the given name, identifiers, prompts, and media assets.
|
|
34
37
|
Everything is matched up by the indexes of the lists.
|
|
@@ -41,31 +44,54 @@ class RapidataBenchmarkManager:
|
|
|
41
44
|
prompts: The prompts that will be registered for the benchmark.
|
|
42
45
|
prompt_assets: The prompt assets that will be registered for the benchmark.
|
|
43
46
|
tags: The tags that will be associated with the prompts to use for filtering the leaderboard results. They will NOT be shown to the users.
|
|
47
|
+
|
|
48
|
+
Example:
|
|
49
|
+
```python
|
|
50
|
+
name = "Example Benchmark"
|
|
51
|
+
identifiers = ["id1", "id2", "id3"]
|
|
52
|
+
prompts = ["prompt 1", "prompt 2", "prompt 3"]
|
|
53
|
+
prompt_assets = ["https://assets.rapidata.ai/prompt_1.jpg", "https://assets.rapidata.ai/prompt_2.jpg", "https://assets.rapidata.ai/prompt_3.jpg"]
|
|
54
|
+
tags = [["tag1", "tag2"], ["tag2"], ["tag2", "tag3"]]
|
|
55
|
+
|
|
56
|
+
benchmark = create_new_benchmark(name=name, identifiers=identifiers, prompts=prompts, prompt_assets=prompt_assets, tags=tags)
|
|
57
|
+
```
|
|
44
58
|
"""
|
|
45
59
|
if not isinstance(name, str):
|
|
46
60
|
raise ValueError("Name must be a string.")
|
|
47
|
-
|
|
48
|
-
if prompts and (
|
|
61
|
+
|
|
62
|
+
if prompts and (
|
|
63
|
+
not isinstance(prompts, list)
|
|
64
|
+
or not all(isinstance(prompt, str) or prompt is None for prompt in prompts)
|
|
65
|
+
):
|
|
49
66
|
raise ValueError("Prompts must be a list of strings or None.")
|
|
50
|
-
|
|
51
|
-
if prompt_assets and (
|
|
67
|
+
|
|
68
|
+
if prompt_assets and (
|
|
69
|
+
not isinstance(prompt_assets, list)
|
|
70
|
+
or not all(
|
|
71
|
+
isinstance(asset, str) or asset is None for asset in prompt_assets
|
|
72
|
+
)
|
|
73
|
+
):
|
|
52
74
|
raise ValueError("Media assets must be a list of strings or None.")
|
|
53
|
-
|
|
54
|
-
if not isinstance(identifiers, list) or not all(
|
|
75
|
+
|
|
76
|
+
if not isinstance(identifiers, list) or not all(
|
|
77
|
+
isinstance(identifier, str) for identifier in identifiers
|
|
78
|
+
):
|
|
55
79
|
raise ValueError("Identifiers must be a list of strings.")
|
|
56
|
-
|
|
80
|
+
|
|
57
81
|
if prompts and len(identifiers) != len(prompts):
|
|
58
82
|
raise ValueError("Identifiers and prompts must have the same length.")
|
|
59
|
-
|
|
83
|
+
|
|
60
84
|
if prompt_assets and len(identifiers) != len(prompt_assets):
|
|
61
85
|
raise ValueError("Identifiers and media assets must have the same length.")
|
|
62
|
-
|
|
86
|
+
|
|
63
87
|
if not prompts and not prompt_assets:
|
|
64
|
-
raise ValueError(
|
|
65
|
-
|
|
88
|
+
raise ValueError(
|
|
89
|
+
"At least one of prompts or media assets must be provided."
|
|
90
|
+
)
|
|
91
|
+
|
|
66
92
|
if len(set(identifiers)) != len(identifiers):
|
|
67
93
|
raise ValueError("Identifiers must be unique.")
|
|
68
|
-
|
|
94
|
+
|
|
69
95
|
if tags and len(identifiers) != len(tags):
|
|
70
96
|
raise ValueError("Identifiers and tags must have the same length.")
|
|
71
97
|
|
|
@@ -78,32 +104,49 @@ class RapidataBenchmarkManager:
|
|
|
78
104
|
benchmark = RapidataBenchmark(name, benchmark_result.id, self.__openapi_service)
|
|
79
105
|
|
|
80
106
|
prompts_list = prompts if prompts is not None else [None] * len(identifiers)
|
|
81
|
-
media_assets_list =
|
|
107
|
+
media_assets_list = (
|
|
108
|
+
prompt_assets if prompt_assets is not None else [None] * len(identifiers)
|
|
109
|
+
)
|
|
82
110
|
tags_list = tags if tags is not None else [None] * len(identifiers)
|
|
83
111
|
|
|
84
|
-
for identifier, prompt, asset, tag in zip(
|
|
112
|
+
for identifier, prompt, asset, tag in zip(
|
|
113
|
+
identifiers, prompts_list, media_assets_list, tags_list
|
|
114
|
+
):
|
|
85
115
|
benchmark.add_prompt(identifier, prompt, asset, tag)
|
|
86
116
|
|
|
87
117
|
return benchmark
|
|
88
|
-
|
|
118
|
+
|
|
89
119
|
def get_benchmark_by_id(self, id: str) -> RapidataBenchmark:
|
|
90
120
|
"""
|
|
91
121
|
Returns a benchmark by its ID.
|
|
92
122
|
"""
|
|
93
|
-
benchmark_result =
|
|
94
|
-
|
|
123
|
+
benchmark_result = (
|
|
124
|
+
self.__openapi_service.benchmark_api.benchmark_benchmark_id_get(
|
|
125
|
+
benchmark_id=id
|
|
126
|
+
)
|
|
95
127
|
)
|
|
96
|
-
return RapidataBenchmark(
|
|
97
|
-
|
|
98
|
-
|
|
128
|
+
return RapidataBenchmark(
|
|
129
|
+
benchmark_result.name, benchmark_result.id, self.__openapi_service
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
def find_benchmarks(
|
|
133
|
+
self, name: str = "", amount: int = 10
|
|
134
|
+
) -> list[RapidataBenchmark]:
|
|
99
135
|
"""
|
|
100
136
|
Returns a list of benchmarks by their name.
|
|
101
137
|
"""
|
|
102
138
|
benchmark_result = self.__openapi_service.benchmark_api.benchmarks_get(
|
|
103
139
|
QueryModel(
|
|
104
140
|
page=PageInfo(index=1, size=amount),
|
|
105
|
-
filter=RootFilter(
|
|
106
|
-
|
|
141
|
+
filter=RootFilter(
|
|
142
|
+
filters=[Filter(field="Name", operator="Contains", value=name)]
|
|
143
|
+
),
|
|
144
|
+
sortCriteria=[
|
|
145
|
+
SortCriterion(direction="Desc", propertyName="CreatedAt")
|
|
146
|
+
],
|
|
107
147
|
)
|
|
108
148
|
)
|
|
109
|
-
return [
|
|
149
|
+
return [
|
|
150
|
+
RapidataBenchmark(benchmark.name, benchmark.id, self.__openapi_service)
|
|
151
|
+
for benchmark in benchmark_result.items
|
|
152
|
+
]
|
|
rapidata/rapidata_client/config/__init__.py
@@ -0,0 +1 @@
|
|
|
1
|
+
from .config import rapidata_config
|
|
rapidata/rapidata_client/config/config.py
@@ -0,0 +1,33 @@
|
|
|
1
|
+
class _RapidataConfig:
|
|
2
|
+
def __init__(self):
|
|
3
|
+
self.__maxUploadWorkers: int = 10
|
|
4
|
+
self.__uploadMaxRetries: int = 3
|
|
5
|
+
|
|
6
|
+
@property
|
|
7
|
+
def max_upload_workers(self) -> int:
|
|
8
|
+
return self.__maxUploadWorkers
|
|
9
|
+
|
|
10
|
+
@max_upload_workers.setter
|
|
11
|
+
def max_upload_workers(self, value: int) -> None:
|
|
12
|
+
if value < 1:
|
|
13
|
+
raise ValueError("max_upload_workers must be greater than 0")
|
|
14
|
+
self.__maxUploadWorkers = value
|
|
15
|
+
|
|
16
|
+
@property
|
|
17
|
+
def upload_max_retries(self) -> int:
|
|
18
|
+
return self.__uploadMaxRetries
|
|
19
|
+
|
|
20
|
+
@upload_max_retries.setter
|
|
21
|
+
def upload_max_retries(self, value: int) -> None:
|
|
22
|
+
if value < 1:
|
|
23
|
+
raise ValueError("upload_max_retries must be greater than 0")
|
|
24
|
+
self.__uploadMaxRetries = value
|
|
25
|
+
|
|
26
|
+
def __str__(self) -> str:
|
|
27
|
+
return f"RapidataConfig(max_upload_workers={self.__maxUploadWorkers}, upload_max_retries={self.__uploadMaxRetries})"
|
|
28
|
+
|
|
29
|
+
def __repr__(self) -> str:
|
|
30
|
+
return self.__str__()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
rapidata_config = _RapidataConfig()
|
|
rapidata/rapidata_client/datapoints/assets/_sessions.py
@@ -2,34 +2,39 @@ import requests
|
|
|
2
2
|
from requests.adapters import HTTPAdapter
|
|
3
3
|
from urllib3.util.retry import Retry
|
|
4
4
|
|
|
5
|
+
from rapidata.rapidata_client.config.config import rapidata_config
|
|
6
|
+
|
|
7
|
+
|
|
5
8
|
class SessionManager:
|
|
6
9
|
_session = None
|
|
7
|
-
|
|
10
|
+
|
|
8
11
|
@classmethod
|
|
9
|
-
def get_session(
|
|
12
|
+
def get_session(
|
|
13
|
+
cls,
|
|
14
|
+
) -> requests.Session:
|
|
10
15
|
"""Get a singleton requests session with retry logic.
|
|
11
16
|
|
|
12
17
|
Returns:
|
|
13
18
|
requests.Session: A singleton requests session with retry logic.
|
|
14
19
|
"""
|
|
15
20
|
if cls._session is None:
|
|
16
|
-
max_retries: int =
|
|
17
|
-
max_workers: int =
|
|
21
|
+
max_retries: int = rapidata_config.upload_max_retries
|
|
22
|
+
max_workers: int = rapidata_config.max_upload_workers
|
|
18
23
|
cls._session = requests.Session()
|
|
19
24
|
retries = Retry(
|
|
20
25
|
total=max_retries,
|
|
21
26
|
backoff_factor=1,
|
|
22
27
|
status_forcelist=[500, 502, 503, 504],
|
|
23
28
|
allowed_methods=["GET"],
|
|
24
|
-
respect_retry_after_header=True
|
|
29
|
+
respect_retry_after_header=True,
|
|
25
30
|
)
|
|
26
31
|
|
|
27
32
|
adapter = HTTPAdapter(
|
|
28
33
|
pool_connections=max_workers * 2,
|
|
29
34
|
pool_maxsize=max_workers * 4,
|
|
30
|
-
max_retries=retries
|
|
35
|
+
max_retries=retries,
|
|
31
36
|
)
|
|
32
|
-
cls._session.mount(
|
|
33
|
-
cls._session.mount(
|
|
37
|
+
cls._session.mount("http://", adapter)
|
|
38
|
+
cls._session.mount("https://", adapter)
|
|
34
39
|
|
|
35
40
|
return cls._session
|
|
rapidata/rapidata_client/order/_rapidata_dataset.py
@@ -1,25 +1,11 @@
|
|
|
1
|
-
from itertools import zip_longest
|
|
2
|
-
|
|
3
|
-
from rapidata.api_client.models.create_datapoint_from_text_sources_model import (
|
|
4
|
-
CreateDatapointFromTextSourcesModel,
|
|
5
|
-
)
|
|
6
|
-
from rapidata.api_client.models.dataset_dataset_id_datapoints_post_request_metadata_inner import (
|
|
7
|
-
DatasetDatasetIdDatapointsPostRequestMetadataInner,
|
|
8
|
-
)
|
|
9
1
|
from rapidata.rapidata_client.datapoints.datapoint import Datapoint
|
|
10
|
-
from rapidata.rapidata_client.datapoints.
|
|
11
|
-
from rapidata.rapidata_client.datapoints.assets import (
|
|
12
|
-
TextAsset,
|
|
13
|
-
MediaAsset,
|
|
14
|
-
MultiAsset,
|
|
15
|
-
BaseAsset,
|
|
16
|
-
)
|
|
2
|
+
from rapidata.rapidata_client.datapoints.assets import TextAsset, MediaAsset
|
|
17
3
|
from rapidata.service import LocalFileService
|
|
18
4
|
from rapidata.service.openapi_service import OpenAPIService
|
|
19
5
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
20
6
|
from tqdm import tqdm
|
|
21
7
|
|
|
22
|
-
from typing import
|
|
8
|
+
from typing import Generator
|
|
23
9
|
from rapidata.rapidata_client.logging import (
|
|
24
10
|
logger,
|
|
25
11
|
managed_print,
|
|
@@ -30,6 +16,7 @@ import threading
|
|
|
30
16
|
from rapidata.rapidata_client.api.rapidata_exception import (
|
|
31
17
|
suppress_rapidata_error_logging,
|
|
32
18
|
)
|
|
19
|
+
from rapidata.rapidata_client.config.config import rapidata_config
|
|
33
20
|
|
|
34
21
|
|
|
35
22
|
def chunk_list(lst: list, chunk_size: int) -> Generator:
|
|
@@ -52,17 +39,19 @@ class RapidataDataset:
|
|
|
52
39
|
|
|
53
40
|
effective_asset_type = datapoints[0]._get_effective_asset_type()
|
|
54
41
|
|
|
42
|
+
logger.debug(f"Config for datapoint upload: {rapidata_config}")
|
|
43
|
+
|
|
55
44
|
if issubclass(effective_asset_type, MediaAsset):
|
|
56
|
-
return self._add_media_from_paths(
|
|
45
|
+
return self._add_media_from_paths(
|
|
46
|
+
datapoints,
|
|
47
|
+
)
|
|
57
48
|
elif issubclass(effective_asset_type, TextAsset):
|
|
58
49
|
return self._add_texts(datapoints)
|
|
59
50
|
else:
|
|
60
51
|
raise ValueError(f"Unsupported asset type: {effective_asset_type}")
|
|
61
52
|
|
|
62
53
|
def _add_texts(
|
|
63
|
-
self,
|
|
64
|
-
datapoints: list[Datapoint],
|
|
65
|
-
max_workers: int = 10,
|
|
54
|
+
self, datapoints: list[Datapoint]
|
|
66
55
|
) -> tuple[list[Datapoint], list[Datapoint]]:
|
|
67
56
|
|
|
68
57
|
def upload_text_datapoint(datapoint: Datapoint, index: int) -> Datapoint:
|
|
@@ -77,7 +66,9 @@ class RapidataDataset:
|
|
|
77
66
|
failed_uploads: list[Datapoint] = []
|
|
78
67
|
|
|
79
68
|
total_uploads = len(datapoints)
|
|
80
|
-
with ThreadPoolExecutor(
|
|
69
|
+
with ThreadPoolExecutor(
|
|
70
|
+
max_workers=rapidata_config.max_upload_workers
|
|
71
|
+
) as executor:
|
|
81
72
|
future_to_datapoint = {
|
|
82
73
|
executor.submit(upload_text_datapoint, datapoint, index=i): datapoint
|
|
83
74
|
for i, datapoint in enumerate(datapoints)
|
|
@@ -104,7 +95,6 @@ class RapidataDataset:
|
|
|
104
95
|
self,
|
|
105
96
|
datapoint: Datapoint,
|
|
106
97
|
index: int,
|
|
107
|
-
max_retries: int = 3,
|
|
108
98
|
) -> tuple[list[Datapoint], list[Datapoint]]:
|
|
109
99
|
"""
|
|
110
100
|
Process single upload with retry logic and error tracking.
|
|
@@ -129,7 +119,7 @@ class RapidataDataset:
|
|
|
129
119
|
urls = datapoint.get_urls()
|
|
130
120
|
|
|
131
121
|
last_exception = None
|
|
132
|
-
for attempt in range(
|
|
122
|
+
for attempt in range(rapidata_config.upload_max_retries):
|
|
133
123
|
try:
|
|
134
124
|
with suppress_rapidata_error_logging():
|
|
135
125
|
self.openapi_service.dataset_api.dataset_dataset_id_datapoints_post(
|
|
@@ -146,17 +136,21 @@ class RapidataDataset:
|
|
|
146
136
|
|
|
147
137
|
except Exception as e:
|
|
148
138
|
last_exception = e
|
|
149
|
-
if attempt <
|
|
139
|
+
if attempt < rapidata_config.upload_max_retries - 1:
|
|
150
140
|
# Exponential backoff: wait 1s, then 2s, then 4s
|
|
151
141
|
retry_delay = 2**attempt
|
|
152
142
|
time.sleep(retry_delay)
|
|
153
143
|
logger.debug("Error: %s", str(last_exception))
|
|
154
|
-
logger.debug(
|
|
144
|
+
logger.debug(
|
|
145
|
+
"Retrying %s of %s...",
|
|
146
|
+
attempt + 1,
|
|
147
|
+
rapidata_config.upload_max_retries,
|
|
148
|
+
)
|
|
155
149
|
|
|
156
150
|
# If we get here, all retries failed
|
|
157
151
|
local_failed.append(datapoint)
|
|
158
152
|
tqdm.write(
|
|
159
|
-
f"Upload failed for {datapoint} after {
|
|
153
|
+
f"Upload failed for {datapoint} after {rapidata_config.upload_max_retries} attempts. \nFinal error: \n{str(last_exception)}"
|
|
160
154
|
)
|
|
161
155
|
|
|
162
156
|
return local_successful, local_failed
|
|
@@ -277,7 +271,6 @@ class RapidataDataset:
|
|
|
277
271
|
def _process_uploads_in_chunks(
|
|
278
272
|
self,
|
|
279
273
|
datapoints: list[Datapoint],
|
|
280
|
-
max_workers: int,
|
|
281
274
|
chunk_size: int,
|
|
282
275
|
stop_progress_tracking: threading.Event,
|
|
283
276
|
progress_tracking_error: threading.Event,
|
|
@@ -288,7 +281,6 @@ class RapidataDataset:
|
|
|
288
281
|
Args:
|
|
289
282
|
media_paths: List of assets to upload
|
|
290
283
|
multi_metadata: Optional sequence of sequences of metadata
|
|
291
|
-
max_workers: Maximum number of concurrent workers
|
|
292
284
|
chunk_size: Number of items to process in each batch
|
|
293
285
|
stop_progress_tracking: Event to signal progress tracking to stop
|
|
294
286
|
progress_tracking_error: Event to detect progress tracking errors
|
|
@@ -300,7 +292,9 @@ class RapidataDataset:
|
|
|
300
292
|
failed_uploads: list[Datapoint] = []
|
|
301
293
|
|
|
302
294
|
try:
|
|
303
|
-
with ThreadPoolExecutor(
|
|
295
|
+
with ThreadPoolExecutor(
|
|
296
|
+
max_workers=rapidata_config.max_upload_workers
|
|
297
|
+
) as executor:
|
|
304
298
|
# Process uploads in chunks to avoid overwhelming the system
|
|
305
299
|
for chunk_idx, chunk in enumerate(chunk_list(datapoints, chunk_size)):
|
|
306
300
|
futures = [
|
|
@@ -395,7 +389,6 @@ class RapidataDataset:
|
|
|
395
389
|
def _add_media_from_paths(
|
|
396
390
|
self,
|
|
397
391
|
datapoints: list[Datapoint],
|
|
398
|
-
max_workers: int = 10,
|
|
399
392
|
chunk_size: int = 50,
|
|
400
393
|
progress_poll_interval: float = 0.5,
|
|
401
394
|
) -> tuple[list[Datapoint], list[Datapoint]]:
|
|
@@ -404,10 +397,8 @@ class RapidataDataset:
|
|
|
404
397
|
|
|
405
398
|
Args:
|
|
406
399
|
datapoints: List of Datapoint objects to upload
|
|
407
|
-
max_workers: Maximum number of concurrent upload workers
|
|
408
400
|
chunk_size: Number of items to process in each batch
|
|
409
401
|
progress_poll_interval: Time in seconds between progress checks
|
|
410
|
-
|
|
411
402
|
Returns:
|
|
412
403
|
tuple[list[Datapoint], list[Datapoint]]: Lists of successful and failed datapoints
|
|
413
404
|
|
|
@@ -435,7 +426,6 @@ class RapidataDataset:
|
|
|
435
426
|
try:
|
|
436
427
|
successful_uploads, failed_uploads = self._process_uploads_in_chunks(
|
|
437
428
|
datapoints,
|
|
438
|
-
max_workers,
|
|
439
429
|
chunk_size,
|
|
440
430
|
stop_progress_tracking,
|
|
441
431
|
progress_tracking_error,
|
|
{rapidata-2.35.2.dist-info → rapidata-2.35.3.dist-info}/RECORD
@@ -1,4 +1,4 @@
|
|
|
1
|
-
rapidata/__init__.py,sha256=
|
|
1
|
+
rapidata/__init__.py,sha256=EFlikunMBEfpqFElj3ZTxl3AHZWm9lycdg6ucJQHsx0,918
|
|
2
2
|
rapidata/api_client/__init__.py,sha256=fb2lqv3sj48wAgarp3g6hvtTgd7bfI01DJJLuBZQnFI,34850
|
|
3
3
|
rapidata/api_client/api/__init__.py,sha256=dGnSE9oPO_ahGh-E1jtw4_VuM_vQueQFuv0IVMQo6uo,1546
|
|
4
4
|
rapidata/api_client/api/benchmark_api.py,sha256=bC8hAPgHIDU5u1e0loWPWnZX33BW6gsAR8oc5199q2k,129777
|
|
@@ -538,7 +538,7 @@ rapidata/api_client/models/workflow_state.py,sha256=5LAK1se76RCoozeVB6oxMPb8p_5b
|
|
|
538
538
|
rapidata/api_client/models/zip_entry_file_wrapper.py,sha256=06CoNJD3x511K3rnSmkrwwhc9GbQxwxF-c0ldOyJbAs,4240
|
|
539
539
|
rapidata/api_client/rest.py,sha256=rtIMcgINZOUaDFaJIinJkXRSddNJmXvMRMfgO2Ezk2o,10835
|
|
540
540
|
rapidata/api_client_README.md,sha256=JnoGKzDZ5evkJrn1YtDF21Mz6Z7b1pIDXrkmjb1mNfQ,62660
|
|
541
|
-
rapidata/rapidata_client/__init__.py,sha256=
|
|
541
|
+
rapidata/rapidata_client/__init__.py,sha256=XfJ9ixxhDus6o5thb2FE1xCPi7uz0raHvXgd-LMuLzE,1216
|
|
542
542
|
rapidata/rapidata_client/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
543
543
|
rapidata/rapidata_client/api/rapidata_exception.py,sha256=4BxfpOa701V5Ph17qzl9QNmBRotV93TwJCcA5E1Ngwc,4631
|
|
544
544
|
rapidata/rapidata_client/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -546,9 +546,11 @@ rapidata/rapidata_client/benchmark/_detail_mapper.py,sha256=HmzJwR2dojs0c2PaEJ5l
|
|
|
546
546
|
rapidata/rapidata_client/benchmark/leaderboard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
547
547
|
rapidata/rapidata_client/benchmark/leaderboard/rapidata_leaderboard.py,sha256=xRNXw__K4-4wb4UI-EgLyJRwGO5OkFdoN3A1S3MkdF4,6092
|
|
548
548
|
rapidata/rapidata_client/benchmark/participant/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
549
|
-
rapidata/rapidata_client/benchmark/participant/_participant.py,sha256=
|
|
549
|
+
rapidata/rapidata_client/benchmark/participant/_participant.py,sha256=ZCsNW_ea4h-seS4Vwq8NiKGyVfo2_RCiQE-hlWIGnms,4399
|
|
550
550
|
rapidata/rapidata_client/benchmark/rapidata_benchmark.py,sha256=Z0jT9hiWyS3km0BwtGXbdmtopdnecf1z-ucDdwg06Y0,14793
|
|
551
|
-
rapidata/rapidata_client/benchmark/rapidata_benchmark_manager.py,sha256=
|
|
551
|
+
rapidata/rapidata_client/benchmark/rapidata_benchmark_manager.py,sha256=75xtTyXRN5Q_zQ0m2NALmkOI0mKywbK6EOOvi3npoCI,6039
|
|
552
|
+
rapidata/rapidata_client/config/__init__.py,sha256=XQQ8nkyU6tbz5m9t5pBjNGxgxiHiBgQj1KTTpWExdmI,36
|
|
553
|
+
rapidata/rapidata_client/config/config.py,sha256=okyuYnoSxPF2QUwXEWAE1hPYSRZ1ScarbHfAzlE2oBI,1020
|
|
552
554
|
rapidata/rapidata_client/country_codes/__init__.py,sha256=FB9Dcks44J6C6YBSYmTmNZ71tE130x6NO_3aLJ8fKzQ,40
|
|
553
555
|
rapidata/rapidata_client/country_codes/country_codes.py,sha256=ePHqeb7y9DWQZAnddBzPx1puYBcrgUjdR2sbFijuFD8,283
|
|
554
556
|
rapidata/rapidata_client/datapoints/__init__.py,sha256=PRt3e8-qJigagDlOoE3K0W62M40-1WOEfAycd6lFrj4,242
|
|
@@ -556,7 +558,7 @@ rapidata/rapidata_client/datapoints/assets/__init__.py,sha256=eQkqUrYFME1FCxPY2X
|
|
|
556
558
|
rapidata/rapidata_client/datapoints/assets/_base_asset.py,sha256=B2YWH1NgaeYUYHDW3OPpHM_bqawHbH4EjnRCE2BYwiM,298
|
|
557
559
|
rapidata/rapidata_client/datapoints/assets/_media_asset.py,sha256=T7M_6Xo1FZooSGma0sXzazQ3nKMyfBD7gH7nt56tnms,10584
|
|
558
560
|
rapidata/rapidata_client/datapoints/assets/_multi_asset.py,sha256=EBsTShUbJdxfslaxmdPlBORtzFftXfnt3m28EMiL12s,1818
|
|
559
|
-
rapidata/rapidata_client/datapoints/assets/_sessions.py,sha256=
|
|
561
|
+
rapidata/rapidata_client/datapoints/assets/_sessions.py,sha256=ANFmb6JvFGtS5LbGAuBYR9Y9NvbwhwV-1PmdiTR6q_I,1238
|
|
560
562
|
rapidata/rapidata_client/datapoints/assets/_text_asset.py,sha256=bF3S624oDpZ-TmnPjkciJStlV61U24AbxZEBpyS-L1Y,807
|
|
561
563
|
rapidata/rapidata_client/datapoints/assets/data_type_enum.py,sha256=v6gR2Wqenb9H_Bs6dKmUrkbjYRDD3tZmeoL5f8LlAcM,239
|
|
562
564
|
rapidata/rapidata_client/datapoints/datapoint.py,sha256=4XnRVj6zC4O3bEG0ifp97XcnAFmC9NgICkh__UEBEBY,4756
|
|
@@ -594,7 +596,7 @@ rapidata/rapidata_client/logging/__init__.py,sha256=4gLxePW8TvgYDZmPWMcf6fA8bEyu
|
|
|
594
596
|
rapidata/rapidata_client/logging/logger.py,sha256=9vULXUizGObQeqMY-CryiAQsq8xDZw0ChLhvV8oa99s,3907
|
|
595
597
|
rapidata/rapidata_client/logging/output_manager.py,sha256=AmSVZ2emVW5UWgOiNqkXNVRItsvd5Ox0hsIoZQhYYYo,653
|
|
596
598
|
rapidata/rapidata_client/order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
597
|
-
rapidata/rapidata_client/order/_rapidata_dataset.py,sha256=
|
|
599
|
+
rapidata/rapidata_client/order/_rapidata_dataset.py,sha256=DK2roDgqzKM1_wt8L23idItVTcL767LOWNolLLF5VsA,17667
|
|
598
600
|
rapidata/rapidata_client/order/_rapidata_order_builder.py,sha256=UMkCKZnpEc02ax1E2hIE6LAnnq7sgp1Sjxg0bpYz8Jo,12662
|
|
599
601
|
rapidata/rapidata_client/order/rapidata_order.py,sha256=oKAKs7PgOhCyVIYbKPcdUTwSttvVpoFx8_KAe2V6veI,13714
|
|
600
602
|
rapidata/rapidata_client/order/rapidata_order_manager.py,sha256=vq0wG5BSGeYB2LO_fwzOrrR40mKUmkcMPeBZF1pwgoY,38055
|
|
@@ -651,7 +653,7 @@ rapidata/service/__init__.py,sha256=s9bS1AJZaWIhLtJX_ZA40_CK39rAAkwdAmymTMbeWl4,
|
|
|
651
653
|
rapidata/service/credential_manager.py,sha256=ULpsE-nGz4VlvKQt0LDqqsgIjwd0N1rhEDWFs5mAdPc,8699
|
|
652
654
|
rapidata/service/local_file_service.py,sha256=pgorvlWcx52Uh3cEG6VrdMK_t__7dacQ_5AnfY14BW8,877
|
|
653
655
|
rapidata/service/openapi_service.py,sha256=aY53siftbo_7YwUN-4FLkfEzvVmTfbHTZ8b4wrkpG6E,5486
|
|
654
|
-
rapidata-2.35.
|
|
655
|
-
rapidata-2.35.
|
|
656
|
-
rapidata-2.35.
|
|
657
|
-
rapidata-2.35.
|
|
656
|
+
rapidata-2.35.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
657
|
+
rapidata-2.35.3.dist-info/METADATA,sha256=Ppr4WV7lBr1jDmA1dO1tmBCMREs_vhpa4OqYdjtEluA,1264
|
|
658
|
+
rapidata-2.35.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
659
|
+
rapidata-2.35.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|