atlan-application-sdk 0.1.1rc35__py3-none-any.whl → 0.1.1rc37__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
@@ -0,0 +1,139 @@
+ """Redis lock interceptor for Temporal workflows.
+
+ Manages distributed locks for activities decorated with @needs_lock using
+ separate lock acquisition and release activities to avoid workflow deadlocks.
+ """
+
+ from datetime import timedelta
+ from typing import Any, Dict, Optional, Type
+
+ from temporalio import workflow
+ from temporalio.common import RetryPolicy
+ from temporalio.worker import (
+     Interceptor,
+     StartActivityInput,
+     WorkflowInboundInterceptor,
+     WorkflowInterceptorClassInput,
+     WorkflowOutboundInterceptor,
+ )
+
+ from application_sdk.common.error_codes import WorkflowError
+ from application_sdk.constants import (
+     APPLICATION_NAME,
+     IS_LOCKING_DISABLED,
+     LOCK_METADATA_KEY,
+ )
+ from application_sdk.observability.logger_adaptor import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class RedisLockInterceptor(Interceptor):
+     """Main interceptor class for Redis distributed locking."""
+
+     def __init__(self, activities: Dict[str, Any]):
+         """Initialize Redis lock interceptor.
+
+         Args:
+             activities: Dictionary mapping activity names to activity functions
+         """
+         self.activities = activities
+
+     def workflow_interceptor_class(
+         self, input: WorkflowInterceptorClassInput
+     ) -> Optional[Type[WorkflowInboundInterceptor]]:
+         activities = self.activities
+
+         class RedisLockWorkflowInboundInterceptor(WorkflowInboundInterceptor):
+             """Inbound interceptor that manages Redis locks for activities."""
+
+             def init(self, outbound: WorkflowOutboundInterceptor) -> None:
+                 """Initialize with Redis lock outbound interceptor."""
+                 lock_outbound = RedisLockOutboundInterceptor(outbound, activities)
+                 super().init(lock_outbound)
+
+         return RedisLockWorkflowInboundInterceptor
+
+
+ class RedisLockOutboundInterceptor(WorkflowOutboundInterceptor):
+     """Outbound interceptor that acquires Redis locks before activity execution."""
+
+     def __init__(self, next: WorkflowOutboundInterceptor, activities: Dict[str, Any]):
+         super().__init__(next)
+         self.activities = activities
+
+     async def start_activity(  # type: ignore[override]
+         self, input: StartActivityInput
+     ) -> workflow.ActivityHandle[Any]:
+         """Start activity with distributed lock if required."""
+
+         # Check if activity needs locking
+         activity_fn = self.activities.get(input.activity)
+         if (
+             not activity_fn
+             or not hasattr(activity_fn, LOCK_METADATA_KEY)
+             or IS_LOCKING_DISABLED
+         ):
+             return await self.next.start_activity(input)
+
+         lock_config = getattr(activity_fn, LOCK_METADATA_KEY)
+         lock_name = lock_config.get("lock_name", input.activity)
+         max_locks = lock_config.get("max_locks", 5)
+         if not input.schedule_to_close_timeout:
+             logger.error(
+                 f"Activity '{input.activity}' with @needs_lock decorator requires schedule_to_close_timeout"
+             )
+             raise WorkflowError(
+                 f"{WorkflowError.WORKFLOW_CONFIG_ERROR}: Activity '{input.activity}' with @needs_lock decorator must be called with schedule_to_close_timeout parameter. "
+                 f"Example: workflow.execute_activity('{input.activity}', schedule_to_close_timeout=timedelta(minutes=10))"
+             )
+         ttl_seconds = int(input.schedule_to_close_timeout.total_seconds())
+
+         # Orchestrate lock acquisition -> business activity -> lock release
+         return await self._execute_with_lock_orchestration(
+             input, lock_name, max_locks, ttl_seconds
+         )
+
+     async def _execute_with_lock_orchestration(
+         self,
+         input: StartActivityInput,
+         lock_name: str,
+         max_locks: int,
+         ttl_seconds: int,
+     ) -> workflow.ActivityHandle[Any]:
+         """Execute activity with distributed lock orchestration."""
+         owner_id = f"{APPLICATION_NAME}:{workflow.info().run_id}"
+         lock_result = None
+
+         try:
+             # Step 1: Acquire lock via dedicated activity (can take >2s safely)
+             start_to_close_timeout = workflow.info().execution_timeout
+             lock_result = await workflow.execute_activity(
+                 "acquire_distributed_lock",
+                 args=[lock_name, max_locks, ttl_seconds, owner_id],
+                 start_to_close_timeout=start_to_close_timeout,
+                 retry_policy=RetryPolicy(maximum_attempts=1),
+             )
+
+             logger.debug(f"Lock acquired: {lock_result}, executing {input.activity}")
+
+             # Step 2: Execute the business activity and return its handle
+             return await self.next.start_activity(input)
+
+         finally:
+             # Step 3: Release lock (fire-and-forget with short timeout)
+             if lock_result is not None:
+                 try:
+                     await workflow.execute_local_activity(
+                         "release_distributed_lock",
+                         args=[lock_result["resource_id"], lock_result["owner_id"]],
+                         start_to_close_timeout=timedelta(seconds=5),
+                         retry_policy=RetryPolicy(maximum_attempts=1),
+                     )
+                     logger.debug(f"Lock released: {lock_result['resource_id']}")
+                 except Exception as e:
+                     # Silent failure - TTL will handle cleanup
+                     logger.warning(
+                         f"Lock release failed for {lock_result['resource_id']}: {e}. "
+                         f"TTL will handle cleanup."
+                     )
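For orientation, here is a minimal worker-wiring sketch for the interceptor above. The module path of the new file, the task queue name, and the placeholder activity are assumptions (the diff does not show where the file lives or how the SDK registers it); the Temporal calls used (Client.connect, Worker, activity.defn, the interceptors argument) are standard temporalio APIs.

    import asyncio

    from temporalio import activity
    from temporalio.client import Client
    from temporalio.worker import Worker

    # Assumed import path; the new module's location inside the package is not shown in this diff.
    from application_sdk.interceptors.redis_lock import RedisLockInterceptor


    @activity.defn(name="fetch_databases")
    async def fetch_databases() -> int:
        # Placeholder business activity; in the SDK this would carry @needs_lock
        # metadata so the interceptor acquires a Redis lock before it runs.
        return 0


    async def main() -> None:
        client = await Client.connect("localhost:7233")
        worker = Worker(
            client,
            task_queue="sql-metadata-extraction",  # assumed queue name
            activities=[fetch_databases],
            # The interceptor needs the name -> function mapping so it can look up
            # @needs_lock metadata when a workflow schedules an activity.
            interceptors=[RedisLockInterceptor({"fetch_databases": fetch_databases})],
        )
        await worker.run()


    if __name__ == "__main__":
        asyncio.run(main())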
@@ -13,6 +13,7 @@ from typing import (
      Dict,
      Generator,
      List,
+     Literal,
      Optional,
      Union,
      cast,
@@ -31,7 +32,7 @@ logger = get_logger(__name__)
  activity.logger = logger

  if TYPE_CHECKING:
-     import daft
+     import daft  # type: ignore
      import pandas as pd


@@ -52,6 +53,27 @@ class Output(ABC):
      output_prefix: str
      total_record_count: int
      chunk_count: int
+     statistics: List[int] = []
+
+     def estimate_dataframe_file_size(
+         self, dataframe: "pd.DataFrame", file_type: Literal["json", "parquet"]
+     ) -> int:
+         """Estimate File size of a DataFrame by sampling a few records."""
+         if len(dataframe) == 0:
+             return 0
+
+         # Sample up to 10 records to estimate average size
+         sample_size = min(10, len(dataframe))
+         sample = dataframe.head(sample_size)
+         if file_type == "json":
+             sample_file = sample.to_json(orient="records", lines=True)
+         else:
+             sample_file = sample.to_parquet(index=False, compression="snappy")
+         if sample_file is not None:
+             avg_record_size = len(sample_file) / sample_size
+             return int(avg_record_size * len(dataframe))
+
+         return 0

      def process_null_fields(
          self,
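The new estimate_dataframe_file_size helper avoids serializing the whole DataFrame: it serializes up to 10 sample rows and extrapolates. A standalone illustration of the same arithmetic (not SDK code; the column and row counts are made up):

    import pandas as pd

    df = pd.DataFrame({"table_name": [f"table_{i}" for i in range(1_000)]})
    sample = df.head(10)
    # Average serialized size of a sampled record, scaled to the full row count.
    avg_record_size = len(sample.to_json(orient="records", lines=True)) / len(sample)
    estimated_bytes = int(avg_record_size * len(df))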
@@ -217,6 +239,7 @@ class Output(ABC):
          statistics = {
              "total_record_count": self.total_record_count,
              "chunk_count": self.chunk_count,
+             "partitions": self.statistics,
          }

          # Write the statistics to a json file
@@ -6,6 +6,7 @@ import orjson
  from temporalio import activity

  from application_sdk.activities.common.utils import get_object_store_prefix
+ from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
  from application_sdk.observability.logger_adaptor import get_logger
  from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
  from application_sdk.outputs import Output
@@ -15,7 +16,7 @@ logger = get_logger(__name__)
  activity.logger = logger

  if TYPE_CHECKING:
-     import daft
+     import daft  # type: ignore
      import pandas as pd


@@ -32,7 +33,7 @@ def path_gen(chunk_start: int | None, chunk_count: int) -> str:
      if chunk_start is None:
          return f"{str(chunk_count)}.json"
      else:
-         return f"{str(chunk_start+chunk_count)}.json"
+         return f"chunk-{chunk_start}-part{chunk_count}.json"


  def convert_datetime_to_epoch(data: Any) -> Any:
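The chunked-file naming moves from a single running number to an explicit chunk/part scheme; a quick before/after of the updated path_gen shown above:

    def path_gen(chunk_start: int | None, chunk_count: int) -> str:
        # Mirrors the updated function above.
        if chunk_start is None:
            return f"{chunk_count}.json"
        return f"chunk-{chunk_start}-part{chunk_count}.json"

    assert path_gen(None, 2) == "2.json"
    assert path_gen(3, 2) == "chunk-3-part2.json"  # was "5.json" before this change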
@@ -124,6 +125,10 @@ class JsonOutput(Output):
          self.chunk_size = chunk_size or 100000
          self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
          self.current_buffer_size = 0
+         self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
+         self.max_file_size_bytes = int(
+             DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
+         )  # 90% of DAPR limit as safety buffer
          self.path_gen = path_gen
          self.start_marker = start_marker
          self.end_marker = end_marker
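The buffer is now bounded by bytes as well as rows, with the ceiling derived from DAPR_MAX_GRPC_MESSAGE_LENGTH. The constant's value is not part of this diff; assuming Dapr's default 4 MiB gRPC message limit, the threshold works out as:

    DAPR_MAX_GRPC_MESSAGE_LENGTH = 4 * 1024 * 1024  # assumed default, 4 MiB
    max_file_size_bytes = int(DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9)
    print(max_file_size_bytes)  # 3774873 bytes, leaving ~0.4 MiB of headroom per file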
@@ -172,8 +177,21 @@ class JsonOutput(Output):
              ]

              for chunk in chunks:
+                 # Estimate size of this chunk
+                 chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "json")
+
+                 # Check if adding this chunk would exceed size limit
+                 if (
+                     self.current_buffer_size_bytes + chunk_size_bytes
+                     > self.max_file_size_bytes
+                     and self.current_buffer_size > 0
+                 ):
+                     # Flush current buffer before adding this chunk
+                     await self._flush_buffer()
+
                  self.buffer.append(chunk)
                  self.current_buffer_size += len(chunk)
+                 self.current_buffer_size_bytes += chunk_size_bytes

                  if self.current_buffer_size >= partition:
                      await self._flush_buffer()
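Restated outside the class, the rule above flushes before appending a chunk only when the byte estimate would overflow a non-empty buffer; a single oversized chunk is still buffered and flushed on its own. Illustrative values, assuming the 4 MiB Dapr default:

    def should_flush_before_append(
        buffered_bytes: int, buffered_rows: int, incoming_bytes: int, max_bytes: int
    ) -> bool:
        # Same condition as the chunk-buffering loop above.
        return buffered_bytes + incoming_bytes > max_bytes and buffered_rows > 0

    MAX_BYTES = 3_774_873  # 90% of an assumed 4 MiB limit
    assert should_flush_before_append(3_500_000, 90_000, 400_000, MAX_BYTES)
    assert not should_flush_before_append(0, 0, 5_000_000, MAX_BYTES)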
@@ -237,45 +255,19 @@ class JsonOutput(Output):
                      row, preserve_fields, null_to_empty_dict_fields
                  )
                  # Serialize the row and add it to the buffer
-                 buffer.append(
-                     orjson.dumps(cleaned_row, option=orjson.OPT_APPEND_NEWLINE).decode(
-                         "utf-8"
-                     )
-                 )
-
-                 # If the buffer reaches the specified size, write it to the file
-                 if self.chunk_size and len(buffer) >= self.chunk_size:
-                     self.chunk_count += 1
-                     output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
-                     with open(output_file_name, "w") as f:
-                         f.writelines(buffer)
-                     buffer.clear()  # Clear the buffer
-
-                     # Record chunk metrics
-                     self.metrics.record_metric(
-                         name="json_chunks_written",
-                         value=1,
-                         metric_type=MetricType.COUNTER,
-                         labels={"type": "daft"},
-                         description="Number of chunks written to JSON files",
-                     )
+                 serialized_row = orjson.dumps(
+                     cleaned_row, option=orjson.OPT_APPEND_NEWLINE
+                 ).decode("utf-8")
+                 buffer.append(serialized_row)
+                 self.current_buffer_size_bytes += len(serialized_row)
+                 if (self.chunk_size and len(buffer) >= self.chunk_size) or (
+                     self.current_buffer_size_bytes > self.max_file_size_bytes
+                 ):
+                     await self.flush_daft_buffer(buffer)

              # Write any remaining rows in the buffer
              if buffer:
-                 self.chunk_count += 1
-                 output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
-                 with open(output_file_name, "w") as f:
-                     f.writelines(buffer)
-                 buffer.clear()
-
-                 # Record chunk metrics
-                 self.metrics.record_metric(
-                     name="json_chunks_written",
-                     value=1,
-                     metric_type=MetricType.COUNTER,
-                     labels={"type": "daft"},
-                     description="Number of chunks written to JSON files",
-                 )
+                 await self.flush_daft_buffer(buffer)

              # Record metrics for successful write
              self.metrics.record_metric(
@@ -303,6 +295,32 @@ class JsonOutput(Output):
              )
              logger.error(f"Error writing daft dataframe to json: {str(e)}")

+     async def flush_daft_buffer(self, buffer: List[str]):
+         """Flush the current buffer to a JSON file.
+
+         This method combines all DataFrames in the buffer, writes them to a JSON file,
+         and uploads the file to the object store.
+         """
+         self.chunk_count += 1
+         output_file_name = (
+             f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
+         )
+         with open(output_file_name, "w") as f:
+             f.writelines(buffer)
+         buffer.clear()  # Clear the buffer
+
+         self.current_buffer_size = 0
+         self.current_buffer_size_bytes = 0
+
+         # Record chunk metrics
+         self.metrics.record_metric(
+             name="json_chunks_written",
+             value=1,
+             metric_type=MetricType.COUNTER,
+             labels={"type": "daft"},
+             description="Number of chunks written to JSON files",
+         )
+
      async def _flush_buffer(self):
          """Flush the current buffer to a JSON file.

@@ -353,6 +371,7 @@ class JsonOutput(Output):

              self.buffer.clear()
              self.current_buffer_size = 0
+             self.current_buffer_size_bytes = 0

          except Exception as e:
              # Record metrics for failed write
@@ -1,9 +1,10 @@
  import os
- from typing import TYPE_CHECKING, Literal, Optional
+ from typing import TYPE_CHECKING, List, Literal, Optional, Union

  from temporalio import activity

  from application_sdk.activities.common.utils import get_object_store_prefix
+ from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
  from application_sdk.observability.logger_adaptor import get_logger
  from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
  from application_sdk.outputs import Output
@@ -13,7 +14,7 @@ logger = get_logger(__name__)
  activity.logger = logger

  if TYPE_CHECKING:
-     import daft
+     import daft  # type: ignore
      import pandas as pd


@@ -46,6 +47,7 @@ class ParquetOutput(Output):
          typename: Optional[str] = None,
          write_mode: Literal["append", "overwrite", "overwrite-partitions"] = "append",
          chunk_size: Optional[int] = 100000,
+         buffer_size: Optional[int] = 100000,
          total_record_count: int = 0,
          chunk_count: int = 0,
          chunk_start: Optional[int] = None,
@@ -78,11 +80,19 @@ class ParquetOutput(Output):
          self.typename = typename
          self.write_mode = write_mode
          self.chunk_size = chunk_size
+         self.buffer_size = buffer_size
+         self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
          self.total_record_count = total_record_count
          self.chunk_count = chunk_count
+         self.current_buffer_size = 0
+         self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
+         self.max_file_size_bytes = int(
+             DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
+         )  # 90% of DAPR limit as safety buffer
          self.chunk_start = chunk_start
          self.start_marker = start_marker
          self.end_marker = end_marker
+         self.statistics = []
          self.metrics = get_metrics()

          # Create output directory
@@ -117,7 +127,7 @@ class ParquetOutput(Output):
          if chunk_start is None:
              return f"{str(chunk_count)}.parquet"
          else:
-             return f"{str(chunk_start+chunk_count)}.parquet"
+             return f"chunk-{str(chunk_start)}-part{str(chunk_count)}.parquet"

      async def write_dataframe(self, dataframe: "pd.DataFrame"):
          """Write a pandas DataFrame to Parquet files and upload to object store.
@@ -126,20 +136,46 @@ class ParquetOutput(Output):
              dataframe (pd.DataFrame): The DataFrame to write.
          """
          try:
+             chunk_part = 0
              if len(dataframe) == 0:
                  return

-             # Update counters
-             self.chunk_count += 1
-             self.total_record_count += len(dataframe)
-             file_path = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count, self.start_marker, self.end_marker)}"
-
-             # Write the dataframe to parquet using pandas native method
-             dataframe.to_parquet(
-                 file_path,
-                 index=False,
-                 compression="snappy",  # Using snappy compression by default
+             # Split the DataFrame into chunks
+             partition = (
+                 self.chunk_size
+                 if self.chunk_start is None
+                 else min(self.chunk_size, self.buffer_size)
              )
+             chunks = [
+                 dataframe[i : i + partition]  # type: ignore
+                 for i in range(0, len(dataframe), partition)
+             ]
+
+             for chunk in chunks:
+                 # Estimate size of this chunk
+                 chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "parquet")
+
+                 # Check if adding this chunk would exceed size limit
+                 if (
+                     self.current_buffer_size_bytes + chunk_size_bytes
+                     > self.max_file_size_bytes
+                     and self.current_buffer_size > 0
+                 ):
+                     # Flush current buffer before adding this chunk
+                     chunk_part += 1
+                     await self._flush_buffer(chunk_part)
+
+                 self.buffer.append(chunk)
+                 self.current_buffer_size += len(chunk)
+                 self.current_buffer_size_bytes += chunk_size_bytes
+
+                 if self.current_buffer_size >= partition:  # type: ignore
+                     chunk_part += 1
+                     await self._flush_buffer(chunk_part)
+
+             if self.buffer and self.current_buffer_size > 0:
+                 chunk_part += 1
+                 await self._flush_buffer(chunk_part)

              # Record metrics for successful write
              self.metrics.record_metric(
@@ -159,11 +195,8 @@ class ParquetOutput(Output):
                  description="Number of chunks written to Parquet files",
              )

-             # Upload the file to object store
-             await ObjectStore.upload_file(
-                 source=file_path,
-                 destination=get_object_store_prefix(file_path),
-             )
+             self.chunk_count += 1
+             self.statistics.append(chunk_part)
          except Exception as e:
              # Record metrics for failed write
              self.metrics.record_metric(
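Each write_dataframe call now counts how many part files its chunk produced (chunk_part) and appends that to self.statistics, which the base Output class surfaces as "partitions" in the statistics payload (see the earlier hunk). A small bookkeeping sketch, assuming three dataframes that split into 2, 1, and 3 parts:

    statistics: list[int] = []
    chunk_count = 0

    for parts_written in (2, 1, 3):  # chunk_part after each write_dataframe call
        chunk_count += 1
        statistics.append(parts_written)

    stats_payload = {
        "total_record_count": 300_000,  # illustrative
        "chunk_count": chunk_count,  # 3
        "partitions": statistics,  # [2, 1, 3]
    }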
@@ -245,3 +278,68 @@ class ParquetOutput(Output):
              str: The full path of the output file.
          """
          return self.output_path
+
+     async def _flush_buffer(self, chunk_part):
+         """Flush the current buffer to a Parquet file.
+
+         This method combines all DataFrames in the buffer, writes them to a Parquet file,
+         and uploads the file to the object store.
+
+         Note:
+             If the buffer is empty or has no records, the method returns without writing.
+         """
+         import pandas as pd
+
+         if not self.buffer or not self.current_buffer_size:
+             return
+
+         if not all(isinstance(df, pd.DataFrame) for df in self.buffer):
+             raise TypeError(
+                 "_flush_buffer encountered non-DataFrame elements in buffer. This should not happen."
+             )
+
+         try:
+             # Now it's safe to cast for pd.concat
+             pd_buffer: List[pd.DataFrame] = self.buffer  # type: ignore
+             combined_dataframe = pd.concat(pd_buffer)
+
+             # Write DataFrame to Parquet file
+             if not combined_dataframe.empty:
+                 self.total_record_count += len(combined_dataframe)
+                 output_file_name = (
+                     f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
+                 )
+                 combined_dataframe.to_parquet(
+                     output_file_name, index=False, compression="snappy"
+                 )
+
+                 # Record chunk metrics
+                 self.metrics.record_metric(
+                     name="parquet_chunks_written",
+                     value=1,
+                     metric_type=MetricType.COUNTER,
+                     labels={"type": "pandas"},
+                     description="Number of chunks written to Parquet files",
+                 )
+
+                 # Push the file to the object store
+                 await ObjectStore.upload_file(
+                     source=output_file_name,
+                     destination=get_object_store_prefix(output_file_name),
+                 )
+
+             self.buffer.clear()
+             self.current_buffer_size = 0
+             self.current_buffer_size_bytes = 0
+
+         except Exception as e:
+             # Record metrics for failed write
+             self.metrics.record_metric(
+                 name="parquet_write_errors",
+                 value=1,
+                 metric_type=MetricType.COUNTER,
+                 labels={"type": "pandas", "error": str(e)},
+                 description="Number of errors while writing to Parquet files",
+             )
+             logger.error(f"Error flushing buffer to parquet: {str(e)}")
+             raise e
@@ -2,4 +2,4 @@
  Version information for the application_sdk package.
  """

- __version__ = "0.1.1rc35"
+ __version__ = "0.1.1rc37"
@@ -107,7 +107,11 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
          activity_statistics = ActivityStatistics.model_validate(raw_statistics)
          transform_activities: List[Any] = []

-         if activity_statistics is None or activity_statistics.chunk_count == 0:
+         if (
+             activity_statistics is None
+             or activity_statistics.chunk_count == 0
+             or not activity_statistics.partitions
+         ):
              # to handle the case where the fetch_fn returns None or no chunks
              return

@@ -115,7 +119,9 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
              raise ValueError("Invalid typename")

          batches, chunk_starts = self.get_transform_batches(
-             activity_statistics.chunk_count, activity_statistics.typename
+             activity_statistics.chunk_count,
+             activity_statistics.typename,
+             activity_statistics.partitions,
          )

          for i in range(len(batches)):
@@ -144,7 +150,9 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
              total_record_count += metadata_model.total_record_count
              chunk_count += metadata_model.chunk_count

-     def get_transform_batches(self, chunk_count: int, typename: str):
+     def get_transform_batches(
+         self, chunk_count: int, typename: str, partitions: List[int]
+     ):
          """Get batches for parallel transformation processing.

          Args:
@@ -159,12 +167,17 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
          batches: List[List[str]] = []
          chunk_start_numbers: List[int] = []

-         for i in range(chunk_count):
+         for i, partition in enumerate(partitions):
              # Track starting chunk number (which is just i)
              chunk_start_numbers.append(i)

              # Each batch contains exactly one chunk
-             batches.append([f"{typename}/{i+1}.json"])
+             batches.append(
+                 [
+                     f"{typename}/chunk-{i}-part{file+1}.parquet"
+                     for file in range(partition)
+                 ]
+             )

          return batches, chunk_start_numbers
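With partitions threaded through, each transform batch now lists every part file of a chunk instead of a single numbered JSON file; a standalone illustration of the new batch construction:

    typename = "table"
    partitions = [2, 1]  # part files written for chunk 0 and chunk 1

    batches = [
        [f"{typename}/chunk-{i}-part{part + 1}.parquet" for part in range(parts)]
        for i, parts in enumerate(partitions)
    ]
    # [["table/chunk-0-part1.parquet", "table/chunk-0-part2.parquet"],
    #  ["table/chunk-1-part1.parquet"]]
    chunk_start_numbers = list(range(len(partitions)))  # [0, 1]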
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: atlan-application-sdk
- Version: 0.1.1rc35
+ Version: 0.1.1rc37
  Summary: Atlan Application SDK is a Python library for developing applications on the Atlan Platform
  Project-URL: Repository, https://github.com/atlanhq/application-sdk
  Project-URL: Documentation, https://github.com/atlanhq/application-sdk/README.md
@@ -26,12 +26,14 @@ Requires-Dist: fastapi[standard]>=0.115.0
  Requires-Dist: loguru>=0.7.3
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
  Requires-Dist: psutil>=7.0.0
- Requires-Dist: pyatlan>=8.0.0
+ Requires-Dist: pyatlan>=8.0.2
  Requires-Dist: pydantic>=2.10.6
  Requires-Dist: python-dotenv>=1.1.0
  Requires-Dist: uvloop>=0.21.0; sys_platform != 'win32'
  Provides-Extra: daft
  Requires-Dist: daft>=0.4.12; extra == 'daft'
+ Provides-Extra: distributed-lock
+ Requires-Dist: redis[hiredis]>=5.2.0; extra == 'distributed-lock'
  Provides-Extra: iam-auth
  Requires-Dist: boto3>=1.38.6; extra == 'iam-auth'
  Provides-Extra: iceberg