atlan-application-sdk 0.1.1rc43__py3-none-any.whl → 0.1.1rc45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,13 @@
+ import inspect
  import os
+ import shutil
  from enum import Enum
- from typing import TYPE_CHECKING, List, Optional, Union
+ from typing import TYPE_CHECKING, AsyncGenerator, Generator, List, Optional, Union, cast

  from temporalio import activity

  from application_sdk.activities.common.utils import get_object_store_prefix
+ from application_sdk.common.dataframe_utils import is_empty_dataframe
  from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
  from application_sdk.observability.logger_adaptor import get_logger
  from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
@@ -35,57 +38,57 @@ class ParquetOutput(Output):

  Attributes:
  output_path (str): Base path where Parquet files will be written.
- output_prefix (str): Prefix for files when uploading to object store.
  output_suffix (str): Suffix for output files.
  typename (Optional[str]): Type name of the entity e.g database, schema, table.
  chunk_size (int): Maximum number of records per chunk.
  total_record_count (int): Total number of records processed.
  chunk_count (int): Number of chunks created.
  chunk_start (Optional[int]): Starting index for chunk numbering.
- path_gen (Callable): Function to generate file paths.
  start_marker (Optional[str]): Start marker for query extraction.
  end_marker (Optional[str]): End marker for query extraction.
+ retain_local_copy (bool): Whether to retain the local copy of the files.
+ use_consolidation (bool): Whether to use consolidation.
  """

+ _EXTENSION = ".parquet"
+
  def __init__(
  self,
  output_path: str = "",
  output_suffix: str = "",
- output_prefix: str = "",
  typename: Optional[str] = None,
  chunk_size: Optional[int] = 100000,
- buffer_size: Optional[int] = 100000,
+ buffer_size: int = 5000,
  total_record_count: int = 0,
  chunk_count: int = 0,
  chunk_start: Optional[int] = None,
  start_marker: Optional[str] = None,
  end_marker: Optional[str] = None,
  retain_local_copy: bool = False,
+ use_consolidation: bool = False,
  ):
  """Initialize the Parquet output handler.

  Args:
  output_path (str): Base path where Parquet files will be written.
  output_suffix (str): Suffix for output files.
- output_prefix (str): Prefix for files when uploading to object store.
  typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
  chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
  total_record_count (int, optional): Initial total record count. Defaults to 0.
  chunk_count (int, optional): Initial chunk count. Defaults to 0.
  chunk_start (Optional[int], optional): Starting index for chunk numbering.
  Defaults to None.
- path_gen (Callable, optional): Function to generate file paths.
- Defaults to path_gen function.
  start_marker (Optional[str], optional): Start marker for query extraction.
  Defaults to None.
  end_marker (Optional[str], optional): End marker for query extraction.
  Defaults to None.
  retain_local_copy (bool, optional): Whether to retain the local copy of the files.
  Defaults to False.
+ use_consolidation (bool, optional): Whether to use consolidation.
+ Defaults to False.
  """
  self.output_path = output_path
  self.output_suffix = output_suffix
- self.output_prefix = output_prefix
  self.typename = typename
  self.chunk_size = chunk_size
  self.buffer_size = buffer_size
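
A minimal construction sketch of the updated signature, for orientation. The module path in the import is an assumption (it is not shown in this diff); the parameter names and defaults come from the hunk above.

from application_sdk.outputs.parquet import ParquetOutput  # assumed module path

output = ParquetOutput(
    output_path="/tmp/extraction",   # base directory for parquet files
    output_suffix="raw",             # appended to output_path
    typename="table",                # adds a per-type subdirectory
    chunk_size=100_000,              # also used as the consolidation threshold
    buffer_size=5_000,               # records per temp file while accumulating
    use_consolidation=True,          # opt in to the Daft-based consolidation path
    retain_local_copy=False,         # whether to keep the local copies of the files
)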
@@ -98,128 +101,84 @@ class ParquetOutput(Output):
  DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
  ) # 90% of DAPR limit as safety buffer
  self.chunk_start = chunk_start
+ self.chunk_part = 0
  self.start_marker = start_marker
  self.end_marker = end_marker
- self.statistics = []
+ self.partitions = []
  self.metrics = get_metrics()
  self.retain_local_copy = retain_local_copy

+ # Consolidation-specific attributes
+ # Use consolidation to efficiently write parquet files in buffered manner
+ # since there's no cleaner way to write parquet files incrementally
+ self.use_consolidation = use_consolidation
+ self.consolidation_threshold = (
+ chunk_size or 100000
+ ) # Use chunk_size as threshold
+ self.current_folder_records = 0 # Track records in current temp folder
+ self.temp_folder_index = 0 # Current temp folder index
+ self.temp_folders_created: List[int] = [] # Track temp folders for cleanup
+ self.current_temp_folder_path: Optional[str] = None # Current temp folder path
+
+ if self.chunk_start:
+ self.chunk_count = self.chunk_start + self.chunk_count
+
  # Create output directory
  self.output_path = os.path.join(self.output_path, self.output_suffix)
  if self.typename:
  self.output_path = os.path.join(self.output_path, self.typename)
  os.makedirs(self.output_path, exist_ok=True)

- def path_gen(
+ async def write_batched_dataframe(
  self,
- chunk_start: Optional[int] = None,
- chunk_count: int = 0,
- start_marker: Optional[str] = None,
- end_marker: Optional[str] = None,
- ) -> str:
- """Generate a file path for a chunk.
-
- Args:
- chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
- chunk_count (int): Total number of chunks.
- start_marker (Optional[str]): Start marker for query extraction.
- end_marker (Optional[str]): End marker for query extraction.
-
- Returns:
- str: Generated file path for the chunk.
- """
- # For Query Extraction - use start and end markers without chunk count
- if start_marker and end_marker:
- return f"{start_marker}_{end_marker}.parquet"
+ batched_dataframe: Union[
+ AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
+ ],
+ ):
+ """Write a batched pandas DataFrame to Parquet files with consolidation support.

- # For regular chunking - include chunk count
- if chunk_start is None:
- return f"{str(chunk_count)}.parquet"
- else:
- return f"chunk-{str(chunk_start)}-part{str(chunk_count)}.parquet"
+ This method implements a consolidation strategy to efficiently write parquet files
+ in a buffered manner, since there's no cleaner way to write parquet files incrementally.

- async def write_dataframe(self, dataframe: "pd.DataFrame"):
- """Write a pandas DataFrame to Parquet files and upload to object store.
+ The process:
+ 1. Accumulate DataFrames into temp folders (buffer_size chunks each)
+ 2. When consolidation_threshold is reached, use Daft to merge into optimized files
+ 3. Clean up temporary files after consolidation

  Args:
- dataframe (pd.DataFrame): The DataFrame to write.
+ batched_dataframe: AsyncGenerator or Generator of pandas DataFrames to write.
  """
- try:
- chunk_part = 0
- if len(dataframe) == 0:
- return
-
- # Split the DataFrame into chunks
- partition = (
- self.chunk_size
- if self.chunk_start is None
- else min(self.chunk_size, self.buffer_size)
- )
- chunks = [
- dataframe[i : i + partition] # type: ignore
- for i in range(0, len(dataframe), partition)
- ]
-
- for chunk in chunks:
- # Estimate size of this chunk
- chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "parquet")
-
- # Check if adding this chunk would exceed size limit
- if (
- self.current_buffer_size_bytes + chunk_size_bytes
- > self.max_file_size_bytes
- and self.current_buffer_size > 0
- ):
- # Flush current buffer before adding this chunk
- chunk_part += 1
- await self._flush_buffer(chunk_part)
-
- self.buffer.append(chunk)
- self.current_buffer_size += len(chunk)
- self.current_buffer_size_bytes += chunk_size_bytes
-
- if self.current_buffer_size >= partition: # type: ignore
- chunk_part += 1
- await self._flush_buffer(chunk_part)
+ if not self.use_consolidation:
+ # Fallback to base class implementation
+ await super().write_batched_dataframe(batched_dataframe)
+ return

- if self.buffer and self.current_buffer_size > 0:
- chunk_part += 1
- await self._flush_buffer(chunk_part)
+ try:
+ # Phase 1: Accumulate DataFrames into temp folders
+ if inspect.isasyncgen(batched_dataframe):
+ async for dataframe in batched_dataframe:
+ if not is_empty_dataframe(dataframe):
+ await self._accumulate_dataframe(dataframe)
+ else:
+ sync_generator = cast(
+ Generator["pd.DataFrame", None, None], batched_dataframe
+ )
+ for dataframe in sync_generator:
+ if not is_empty_dataframe(dataframe):
+ await self._accumulate_dataframe(dataframe)

- # Record metrics for successful write
- self.metrics.record_metric(
- name="parquet_write_records",
- value=len(dataframe),
- metric_type=MetricType.COUNTER,
- labels={"type": "pandas", "mode": WriteMode.APPEND.value},
- description="Number of records written to Parquet files from pandas DataFrame",
- )
+ # Phase 2: Consolidate any remaining temp folder
+ if self.current_folder_records > 0:
+ await self._consolidate_current_folder()

- # Record chunk metrics
- self.metrics.record_metric(
- name="parquet_chunks_written",
- value=1,
- metric_type=MetricType.COUNTER,
- labels={"type": "pandas", "mode": WriteMode.APPEND.value},
- description="Number of chunks written to Parquet files",
- )
+ # Phase 3: Cleanup temp folders
+ await self._cleanup_temp_folders()

- self.chunk_count += 1
- self.statistics.append(chunk_part)
  except Exception as e:
- # Record metrics for failed write
- self.metrics.record_metric(
- name="parquet_write_errors",
- value=1,
- metric_type=MetricType.COUNTER,
- labels={
- "type": "pandas",
- "mode": WriteMode.APPEND.value,
- "error": str(e),
- },
- description="Number of errors while writing to Parquet files",
+ logger.error(
+ f"Error in batched dataframe writing with consolidation: {str(e)}"
  )
- logger.error(f"Error writing pandas dataframe to parquet: {str(e)}")
+ await self._cleanup_temp_folders() # Cleanup on error
  raise

  async def write_daft_dataframe(
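
A hedged usage sketch of the batched write path, continuing the construction sketch above. The batch source is illustrative; only the write_batched_dataframe call and the chunk_count/partitions attributes come from the hunks shown here.

import asyncio

import pandas as pd


async def batches():
    # Illustrative batch source; real batches would come from an extractor or SQL client.
    for start in range(0, 20_000, 5_000):
        yield pd.DataFrame({"id": range(start, start + 5_000)})


async def main():
    output = ParquetOutput(output_path="/tmp/extraction", use_consolidation=True)
    await output.write_batched_dataframe(batches())
    # chunk_count and partitions are updated as temp folders are consolidated.
    print(output.chunk_count, output.partitions)


asyncio.run(main())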
@@ -320,7 +279,13 @@ class ParquetOutput(Output):
  name="parquet_write_errors",
  value=1,
  metric_type=MetricType.COUNTER,
- labels={"type": "daft", "mode": write_mode, "error": str(e)},
+ labels={
+ "type": "daft",
+ "mode": write_mode.value
+ if isinstance(write_mode, WriteMode)
+ else write_mode,
+ "error": str(e),
+ },
  description="Number of errors while writing to Parquet files",
  )
  logger.error(f"Error writing daft dataframe to parquet: {str(e)}")
@@ -334,67 +299,171 @@ class ParquetOutput(Output):
  """
  return self.output_path

- async def _flush_buffer(self, chunk_part: int):
- """Flush the current buffer to a Parquet file.
-
- This method combines all DataFrames in the buffer, writes them to a Parquet file,
- and uploads the file to the object store.
+ # Consolidation helper methods
+
+ def _get_temp_folder_path(self, folder_index: int) -> str:
+ """Generate temp folder path consistent with existing structure."""
+ temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+ return os.path.join(temp_base_path, f"folder-{folder_index}")
+
+ def _get_consolidated_file_path(self, folder_index: int, chunk_part: int) -> str:
+ """Generate final consolidated file path using existing path_gen logic."""
+ return os.path.join(
+ self.output_path,
+ self.path_gen(chunk_count=folder_index, chunk_part=chunk_part),
+ )
+
+ async def _accumulate_dataframe(self, dataframe: "pd.DataFrame"):
+ """Accumulate DataFrame into temp folders, writing in buffer_size chunks."""
+
+ # Process dataframe in buffer_size chunks
+ for i in range(0, len(dataframe), self.buffer_size):
+ chunk = dataframe[i : i + self.buffer_size]
+
+ # Check if we need to consolidate current folder before adding this chunk
+ if (
+ self.current_folder_records + len(chunk)
+ ) > self.consolidation_threshold:
+ if self.current_folder_records > 0:
+ await self._consolidate_current_folder()
+ self._start_new_temp_folder()
+
+ # Ensure we have a temp folder ready
+ if self.current_temp_folder_path is None:
+ self._start_new_temp_folder()
+
+ # Write chunk to current temp folder
+ await self._write_chunk_to_temp_folder(cast("pd.DataFrame", chunk))
+ self.current_folder_records += len(chunk)
+
+ def _start_new_temp_folder(self):
+ """Start a new temp folder for accumulation and create the directory."""
+ if self.current_temp_folder_path is not None:
+ self.temp_folders_created.append(self.temp_folder_index)
+ self.temp_folder_index += 1
+
+ self.current_folder_records = 0
+ self.current_temp_folder_path = self._get_temp_folder_path(
+ self.temp_folder_index
+ )
+
+ # Create the directory
+ os.makedirs(self.current_temp_folder_path, exist_ok=True)
+
+ async def _write_chunk_to_temp_folder(self, chunk: "pd.DataFrame"):
+ """Write a chunk to the current temp folder."""
+ if self.current_temp_folder_path is None:
+ raise ValueError("No temp folder path available")
+
+ # Generate file name for this chunk within the temp folder
+ existing_files = len(
+ [
+ f
+ for f in os.listdir(self.current_temp_folder_path)
+ if f.endswith(".parquet")
+ ]
+ )
+ chunk_file_name = f"chunk-{existing_files}.parquet"
+ chunk_file_path = os.path.join(self.current_temp_folder_path, chunk_file_name)

- Note:
- If the buffer is empty or has no records, the method returns without writing.
- """
- import pandas as pd
+ # Write chunk using existing write_chunk method
+ await self.write_chunk(chunk, chunk_file_path)

- if not self.buffer or not self.current_buffer_size:
+ async def _consolidate_current_folder(self):
+ """Consolidate current temp folder using Daft."""
+ if self.current_folder_records == 0 or self.current_temp_folder_path is None:
  return

- if not all(isinstance(df, pd.DataFrame) for df in self.buffer):
- raise TypeError(
- "_flush_buffer encountered non-DataFrame elements in buffer. This should not happen."
- )
-
  try:
- # Now it's safe to cast for pd.concat
- pd_buffer: List[pd.DataFrame] = self.buffer # type: ignore
- combined_dataframe = pd.concat(pd_buffer)
-
- # Write DataFrame to Parquet file
- if not combined_dataframe.empty:
- self.total_record_count += len(combined_dataframe)
- output_file_name = (
- f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
- )
- combined_dataframe.to_parquet(
- output_file_name, index=False, compression="snappy"
- )
+ import daft

- # Record chunk metrics
- self.metrics.record_metric(
- name="parquet_chunks_written",
- value=1,
- metric_type=MetricType.COUNTER,
- labels={"type": "pandas"},
- description="Number of chunks written to Parquet files",
- )
+ # Read all parquet files in temp folder
+ pattern = os.path.join(self.current_temp_folder_path, "*.parquet")
+ daft_df = daft.read_parquet(pattern)
+ partitions = 0

- # Push the file to the object store
- await ObjectStore.upload_file(
- source=output_file_name,
- destination=get_object_store_prefix(output_file_name),
- )
-
- self.buffer.clear()
- self.current_buffer_size = 0
- self.current_buffer_size_bytes = 0
+ # Write consolidated file using Daft with size management
+ with daft.execution_config_ctx(
+ parquet_target_filesize=self.max_file_size_bytes
+ ):
+ # Write to a temp location first
+ temp_consolidated_dir = f"{self.current_temp_folder_path}_temp"
+ result = daft_df.write_parquet(root_dir=temp_consolidated_dir)
+
+ # Get the generated file path and rename to final location
+ result_dict = result.to_pydict()
+ partitions = len(result_dict["path"])
+ for i, file_path in enumerate(result_dict["path"]):
+ if file_path.endswith(".parquet"):
+ consolidated_file_path = self._get_consolidated_file_path(
+ folder_index=self.chunk_count,
+ chunk_part=i,
+ )
+ os.rename(file_path, consolidated_file_path)
+
+ # Upload consolidated file to object store
+ await ObjectStore.upload_file(
+ source=consolidated_file_path,
+ destination=get_object_store_prefix(consolidated_file_path),
+ )
+
+ # Clean up temp consolidated dir
+ shutil.rmtree(temp_consolidated_dir, ignore_errors=True)
+
+ # Update statistics
+ self.chunk_count += 1
+ self.total_record_count += self.current_folder_records
+ self.partitions.append(partitions)

- except Exception as e:
- # Record metrics for failed write
+ # Record metrics
  self.metrics.record_metric(
- name="parquet_write_errors",
+ name="consolidated_files",
  value=1,
  metric_type=MetricType.COUNTER,
- labels={"type": "pandas", "error": str(e)},
- description="Number of errors while writing to Parquet files",
+ labels={"type": "daft_consolidation"},
+ description="Number of consolidated parquet files created",
+ )
+
+ logger.info(
+ f"Consolidated folder {self.temp_folder_index} with {self.current_folder_records} records"
  )
- logger.error(f"Error flushing buffer to parquet: {str(e)}")
- raise e
+
+ except Exception as e:
+ logger.error(
+ f"Error consolidating folder {self.temp_folder_index}: {str(e)}"
+ )
+ raise
+
+ async def _cleanup_temp_folders(self):
+ """Clean up all temp folders after consolidation."""
+ try:
+ # Add current folder to cleanup list if it exists
+ if self.current_temp_folder_path is not None:
+ self.temp_folders_created.append(self.temp_folder_index)
+
+ # Clean up all temp folders
+ for folder_index in self.temp_folders_created:
+ temp_folder = self._get_temp_folder_path(folder_index)
+ if os.path.exists(temp_folder):
+ shutil.rmtree(temp_folder, ignore_errors=True)
+
+ # Clean up base temp directory if it exists and is empty
+ temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+ if os.path.exists(temp_base_path) and not os.listdir(temp_base_path):
+ os.rmdir(temp_base_path)
+
+ # Reset state
+ self.temp_folders_created.clear()
+ self.current_temp_folder_path = None
+ self.temp_folder_index = 0
+ self.current_folder_records = 0
+
+ except Exception as e:
+ logger.warning(f"Error cleaning up temp folders: {str(e)}")
+
+ async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+ """Write a chunk to a Parquet file.
+
+ This method writes a chunk to a Parquet file and uploads the file to the object store.
+ """
+ chunk.to_parquet(file_name, index=False, compression="snappy")
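
The consolidation helpers lean on a small Daft pattern: glob-read the accumulated chunk files, cap output file size via the execution config, then collect the written paths. A standalone sketch of that pattern, with function and parameter names of my own choosing:

import os
from typing import List

import daft


def consolidate_folder(temp_folder: str, out_dir: str, target_filesize: int) -> List[str]:
    # Read every small parquet chunk previously written into the temp folder.
    daft_df = daft.read_parquet(os.path.join(temp_folder, "*.parquet"))

    # Cap the size of each written file, mirroring max_file_size_bytes above.
    with daft.execution_config_ctx(parquet_target_filesize=target_filesize):
        result = daft_df.write_parquet(root_dir=out_dir)

    # write_parquet returns a dataframe of written paths, one row per output part.
    return [p for p in result.to_pydict()["path"] if p.endswith(".parquet")]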
@@ -0,0 +1,4 @@
+ from application_sdk.server.mcp.models import MCPMetadata
+ from application_sdk.server.mcp.server import MCPServer
+
+ __all__ = ["MCPServer", "MCPMetadata"]
@@ -0,0 +1,11 @@
+ from typing import Any, Dict, Optional, Tuple
+
+ from pydantic import BaseModel
+
+
+ class MCPMetadata(BaseModel):
+ name: str
+ description: Optional[str]
+ visible: bool
+ args: Tuple[Any, ...] = ()
+ kwargs: Dict[str, Any] = {}
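
MCPMetadata is consumed by MCPServer.register_tools, which looks the model up on each activity callable via the MCP_METADATA_KEY attribute (see the next file). A hedged sketch of that attachment: the @mcp_tool decorator presumably performs it, and the manual setattr plus the fetch_databases activity below are purely illustrative.

from application_sdk.constants import MCP_METADATA_KEY
from application_sdk.server.mcp.models import MCPMetadata


async def fetch_databases(connection_id: str) -> list:  # hypothetical activity method
    ...


# register_tools() later reads this attribute back with getattr(f, MCP_METADATA_KEY, None).
setattr(
    fetch_databases,
    MCP_METADATA_KEY,
    MCPMetadata(name="fetch_databases", description="List databases", visible=True),
)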
@@ -0,0 +1,96 @@
+ """
+ MCP Server implementation using FastMCP for Atlan Application SDK.
+
+ This module provides the MCPServer class that automatically discovers
+ activities marked with @mcp_tool decorators and mounts them on FastAPI
+ using streamable HTTP transport.
+ """
+
+ from typing import Any, Callable, List, Optional, Tuple, Type
+
+ from fastmcp import FastMCP
+ from fastmcp.server.http import StarletteWithLifespan
+
+ from application_sdk.activities import ActivitiesInterface
+ from application_sdk.constants import MCP_METADATA_KEY
+ from application_sdk.observability.logger_adaptor import get_logger
+ from application_sdk.server.mcp.models import MCPMetadata
+ from application_sdk.workflows import WorkflowInterface
+
+
+ class MCPServer:
+ """
+ MCP Server using FastMCP 2.0 with FastAPI mounting capability.
+
+ This server automatically discovers activities marked with @mcp_tool
+ and creates a FastMCP server that can be mounted on FastAPI.
+ """
+
+ def __init__(self, application_name: str, instructions: Optional[str] = None):
+ """
+ Initialize the MCP server.
+
+ Args:
+ application_name (str): Name of the application
+ instructions (Optional[str]): Description for the MCP server
+ """
+ self.application_name = application_name
+
+ self.logger = get_logger(__name__)
+
+ # FastMCP Server
+ self.server = FastMCP(
+ name=f"{application_name} MCP",
+ instructions=instructions,
+ on_duplicate_tools="error",
+ )
+
+ async def register_tools(
+ self,
+ workflow_and_activities_classes: List[
+ Tuple[Type[WorkflowInterface], Type[ActivitiesInterface]]
+ ],
+ ) -> None:
+ """
+ Discover activities marked with @mcp_tool and register them.
+
+ Args:
+ workflow_and_activities_classes: List of (workflow_class, activities_class) tuples
+ """
+ activity_methods: List[Callable[..., Any]] = []
+ for workflow_class, activities_class in workflow_and_activities_classes:
+ activities_instance = activities_class()
+ activity_methods.extend(workflow_class.get_activities(activities_instance)) # type: ignore
+
+ for f in activity_methods:
+ mcp_metadata: Optional[MCPMetadata] = getattr(f, MCP_METADATA_KEY, None)
+ if not mcp_metadata:
+ self.logger.info(
+ f"No MCP metadata found on activity method {f.__name__}. Skipping tool registration"
+ )
+ continue
+
+ if mcp_metadata.visible:
+ self.logger.info(
+ f"Registering tool {mcp_metadata.name} with description: {mcp_metadata.description}"
+ )
+ self.server.tool(
+ f,
+ name=mcp_metadata.name,
+ description=mcp_metadata.description,
+ *mcp_metadata.args,
+ **mcp_metadata.kwargs,
+ )
+ else:
+ self.logger.info(
+ f"Tool {mcp_metadata.name} is marked as not visible. Skipping tool registration"
+ )
+
+ tools = await self.server.get_tools()
+ self.logger.info(f"Registered {len(tools)} tools: {list(tools.keys())}")
+
+ async def get_http_app(self) -> StarletteWithLifespan:
+ """
+ Get the HTTP app for the MCP server.
+ """
+ return self.server.http_app()
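
A hedged sketch of wiring this server into a FastAPI app. The workflow and activities classes are placeholders, and reusing the MCP app's lifespan follows the usual FastMCP mounting pattern rather than anything shown in this diff.

from fastapi import FastAPI

from application_sdk.server.mcp import MCPServer


async def build_app(workflow_cls, activities_cls) -> FastAPI:
    # Placeholder classes: any WorkflowInterface / ActivitiesInterface pair.
    mcp = MCPServer(application_name="postgres-app", instructions="SQL metadata tools")
    await mcp.register_tools([(workflow_cls, activities_cls)])

    mcp_app = await mcp.get_http_app()
    # Assumption: pass the MCP app's lifespan through so its startup/shutdown hooks run.
    app = FastAPI(lifespan=mcp_app.lifespan)
    app.mount("/mcp", mcp_app)
    return app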
@@ -415,7 +415,7 @@ class QueryBasedTransformer(TransformerInterface):
  )

  # run the SQL on the dataframe
- logger.info(
+ logger.debug(
  f"Running transformer for asset [{typename}] with SQL:\n {entity_sql_template}"
  )
  transformed_df = daft.sql(entity_sql_template)
@@ -2,4 +2,4 @@
  Version information for the application_sdk package.
  """

- __version__ = "0.1.1rc43"
+ __version__ = "0.1.1rc45"
@@ -10,6 +10,7 @@ from typing import Any, Callable, Coroutine, Dict, List, Sequence, Type

  from temporalio import workflow
  from temporalio.common import RetryPolicy
+ from typing_extensions import Tuple

  from application_sdk.activities.common.models import ActivityStatistics
  from application_sdk.activities.metadata_extraction.sql import (
@@ -152,15 +153,15 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):

  def get_transform_batches(
  self, chunk_count: int, typename: str, partitions: List[int]
- ):
+ ) -> Tuple[List[List[str]], List[int]]: # noqa: F821
  """Get batches for parallel transformation processing.

  Args:
  chunk_count (int): Total number of chunks to process.
  typename (str): Type name for the chunks.
-
+ partitions (List[int]): List of partitions for each chunk.
  Returns:
- Tuple[List[List[str]], List[int]]: A tuple containing:
+ Tuple[List[List[str]], List[int]]: A list of file paths.
  - List of batches, where each batch is a list of file paths
  - List of starting chunk numbers for each batch
  """
@@ -174,7 +175,7 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
  # Each batch contains exactly one chunk
  batches.append(
  [
- f"{typename}/chunk-{i}-part{file+1}.parquet"
+ f"{typename}/chunk-{i}-part{file}.parquet"
  for file in range(partition)
  ]
  )
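
The part index now starts at 0, matching the enumerate() over Daft's written paths in ParquetOutput._consolidate_current_folder. A simplified sketch of the file names the workflow now expects; the zero-based chunk indexing here is an assumption for illustration only.

typename = "table"
partitions = [2, 1]  # parts written for each consolidated chunk

batches = [
    [f"{typename}/chunk-{i}-part{part}.parquet" for part in range(partition)]
    for i, partition in enumerate(partitions)  # chunk indexing assumed zero-based here
]
# batches == [
#     ["table/chunk-0-part0.parquet", "table/chunk-0-part1.parquet"],
#     ["table/chunk-1-part0.parquet"],
# ]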