atlan-application-sdk 1.1.0-py3-none-any.whl → 2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. application_sdk/activities/common/sql_utils.py +308 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +654 -0
  14. application_sdk/io/json.py +429 -0
  15. application_sdk/{outputs → io}/parquet.py +358 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +23 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/objectstore.py +30 -7
  25. application_sdk/services/secretstore.py +1 -1
  26. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  27. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  28. application_sdk/version.py +1 -1
  29. application_sdk/worker.py +1 -1
  30. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
  31. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +36 -43
  32. application_sdk/common/dataframe_utils.py +0 -42
  33. application_sdk/events/__init__.py +0 -5
  34. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  35. application_sdk/inputs/__init__.py +0 -168
  36. application_sdk/inputs/iceberg.py +0 -75
  37. application_sdk/inputs/json.py +0 -136
  38. application_sdk/inputs/parquet.py +0 -272
  39. application_sdk/inputs/sql_query.py +0 -271
  40. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  41. application_sdk/outputs/__init__.py +0 -445
  42. application_sdk/outputs/iceberg.py +0 -139
  43. application_sdk/outputs/json.py +0 -268
  44. /application_sdk/{events → interceptors}/models.py +0 -0
  45. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  46. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
  47. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
  48. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
application_sdk/outputs/__init__.py
@@ -1,445 +0,0 @@
- """Output module for handling data output operations.
-
- This module provides base classes and utilities for handling various types of data outputs
- in the application, including file outputs and object store interactions.
- """
-
- import gc
- import inspect
- import os
- from abc import ABC, abstractmethod
- from enum import Enum
- from typing import (
-     TYPE_CHECKING,
-     Any,
-     AsyncGenerator,
-     Dict,
-     Generator,
-     List,
-     Optional,
-     Union,
-     cast,
- )
-
- import orjson
- from temporalio import activity
-
- from application_sdk.activities.common.models import ActivityStatistics
- from application_sdk.activities.common.utils import get_object_store_prefix
- from application_sdk.common.dataframe_utils import is_empty_dataframe
- from application_sdk.observability.logger_adaptor import get_logger
- from application_sdk.observability.metrics_adaptor import MetricType
- from application_sdk.services.objectstore import ObjectStore
-
- logger = get_logger(__name__)
- activity.logger = logger
-
-
- if TYPE_CHECKING:
-     import daft  # type: ignore
-     import pandas as pd
-
-
- class WriteMode(Enum):
-     """Enumeration of write modes for output operations."""
-
-     APPEND = "append"
-     OVERWRITE = "overwrite"
-     OVERWRITE_PARTITIONS = "overwrite-partitions"
-
-
- class Output(ABC):
-     """Abstract base class for output handlers.
-
-     This class defines the interface for output handlers that can write data
-     to various destinations in different formats.
-
-     Attributes:
-         output_path (str): Path where the output will be written.
-         upload_file_prefix (str): Prefix for files when uploading to object store.
-         total_record_count (int): Total number of records processed.
-         chunk_count (int): Number of chunks the output was split into.
-     """
-
-     output_path: str
-     output_prefix: str
-     total_record_count: int
-     chunk_count: int
-     chunk_part: int
-     buffer_size: int
-     max_file_size_bytes: int
-     current_buffer_size: int
-     current_buffer_size_bytes: int
-     partitions: List[int]
-
-     def estimate_dataframe_record_size(self, dataframe: "pd.DataFrame") -> int:
-         """Estimate File size of a DataFrame by sampling a few records."""
-         if len(dataframe) == 0:
-             return 0
-
-         # Sample up to 10 records to estimate average size
-         sample_size = min(10, len(dataframe))
-         sample = dataframe.head(sample_size)
-         file_type = type(self).__name__.lower().replace("output", "")
-         compression_factor = 1
-         if file_type == "json":
-             sample_file = sample.to_json(orient="records", lines=True)
-         else:
-             sample_file = sample.to_parquet(index=False, compression="snappy")
-             compression_factor = 0.01
-         if sample_file is not None:
-             avg_record_size = len(sample_file) / sample_size * compression_factor
-             return int(avg_record_size)
-
-         return 0
-
-     def path_gen(
-         self,
-         chunk_count: Optional[int] = None,
-         chunk_part: int = 0,
-         start_marker: Optional[str] = None,
-         end_marker: Optional[str] = None,
-     ) -> str:
-         """Generate a file path for a chunk.
-
-         Args:
-             chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
-             chunk_count (int): Total number of chunks.
-             start_marker (Optional[str]): Start marker for query extraction.
-             end_marker (Optional[str]): End marker for query extraction.
-
-         Returns:
-             str: Generated file path for the chunk.
-         """
-         # For Query Extraction - use start and end markers without chunk count
-         if start_marker and end_marker:
-             return f"{start_marker}_{end_marker}{self._EXTENSION}"
-
-         # For regular chunking - include chunk count
-         if chunk_count is None:
-             return f"{str(chunk_part)}{self._EXTENSION}"
-         else:
-             return f"chunk-{str(chunk_count)}-part{str(chunk_part)}{self._EXTENSION}"
-
-     def process_null_fields(
-         self,
-         obj: Any,
-         preserve_fields: Optional[List[str]] = None,
-         null_to_empty_dict_fields: Optional[List[str]] = None,
-     ) -> Any:
-         """
-         By default the method removes null values from dictionaries and lists.
-         Except for the fields specified in preserve_fields.
-         And fields in null_to_empty_dict_fields are replaced with empty dict if null.
-
-         Args:
-             obj: The object to clean (dict, list, or other value)
-             preserve_fields: Optional list of field names that should be preserved even if they contain null values
-             null_to_empty_dict_fields: Optional list of field names that should be replaced with empty dict if null
-
-         Returns:
-             The cleaned object with null values removed
-         """
-         if isinstance(obj, dict):
-             result = {}
-             for k, v in obj.items():
-                 # Handle null fields that should be converted to empty dicts
-                 if k in (null_to_empty_dict_fields or []) and v is None:
-                     result[k] = {}
-                     continue
-
-                 # Process the value recursively
-                 processed_value = self.process_null_fields(
-                     v, preserve_fields, null_to_empty_dict_fields
-                 )
-
-                 # Keep the field if it's in preserve_fields or has a non-None processed value
-                 if k in (preserve_fields or []) or processed_value is not None:
-                     result[k] = processed_value
-
-             return result
-         return obj
-
-     async def write_batched_dataframe(
-         self,
-         batched_dataframe: Union[
-             AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
-         ],
-     ):
-         """Write a batched pandas DataFrame to Output.
-
-         This method writes the DataFrame to Output provided, potentially splitting it
-         into chunks based on chunk_size and buffer_size settings.
-
-         Args:
-             dataframe (pd.DataFrame): The DataFrame to write.
-
-         Note:
-             If the DataFrame is empty, the method returns without writing.
-         """
-         try:
-             if inspect.isasyncgen(batched_dataframe):
-                 async for dataframe in batched_dataframe:
-                     if not is_empty_dataframe(dataframe):
-                         await self.write_dataframe(dataframe)
-             else:
-                 # Cast to Generator since we've confirmed it's not an AsyncGenerator
-                 sync_generator = cast(
-                     Generator["pd.DataFrame", None, None], batched_dataframe
-                 )
-                 for dataframe in sync_generator:
-                     if not is_empty_dataframe(dataframe):
-                         await self.write_dataframe(dataframe)
-         except Exception as e:
-             logger.error(f"Error writing batched dataframe: {str(e)}")
-             raise
-
-     async def write_dataframe(self, dataframe: "pd.DataFrame"):
-         """Write a pandas DataFrame to Parquet files and upload to object store.
-
-         Args:
-             dataframe (pd.DataFrame): The DataFrame to write.
-         """
-         try:
-             if self.chunk_start is None:
-                 self.chunk_part = 0
-             if len(dataframe) == 0:
-                 return
-
-             chunk_size_bytes = self.estimate_dataframe_record_size(dataframe)
-
-             for i in range(0, len(dataframe), self.buffer_size):
-                 chunk = dataframe[i : i + self.buffer_size]
-
-                 if (
-                     self.current_buffer_size_bytes + chunk_size_bytes
-                     > self.max_file_size_bytes
-                 ):
-                     output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
-                     if os.path.exists(output_file_name):
-                         await self._upload_file(output_file_name)
-                         self.chunk_part += 1
-
-                 self.current_buffer_size += len(chunk)
-                 self.current_buffer_size_bytes += chunk_size_bytes * len(chunk)
-                 await self._flush_buffer(chunk, self.chunk_part)
-
-                 del chunk
-                 gc.collect()
-
-             if self.current_buffer_size_bytes > 0:
-                 # Finally upload the final file to the object store
-                 output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
-                 if os.path.exists(output_file_name):
-                     await self._upload_file(output_file_name)
-                     self.chunk_part += 1
-
-             # Record metrics for successful write
-             self.metrics.record_metric(
-                 name="write_records",
-                 value=len(dataframe),
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                 description="Number of records written to files from pandas DataFrame",
-             )
-
-             # Record chunk metrics
-             self.metrics.record_metric(
-                 name="chunks_written",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                 description="Number of chunks written to files",
-             )
-
-             # If chunk_start is set we don't want to increment the chunk_count
-             # Since it should only increment the chunk_part in this case
-             if self.chunk_start is None:
-                 self.chunk_count += 1
-             self.partitions.append(self.chunk_part)
-         except Exception as e:
-             # Record metrics for failed write
-             self.metrics.record_metric(
-                 name="write_errors",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={
-                     "type": "pandas",
-                     "mode": WriteMode.APPEND.value,
-                     "error": str(e),
-                 },
-                 description="Number of errors while writing to files",
-             )
-             logger.error(f"Error writing pandas dataframe to files: {str(e)}")
-             raise
-
-     async def write_batched_daft_dataframe(
-         self,
-         batched_dataframe: Union[
-             AsyncGenerator["daft.DataFrame", None],  # noqa: F821
-             Generator["daft.DataFrame", None, None],  # noqa: F821
-         ],
-     ):
-         """Write a batched daft DataFrame to JSON files.
-
-         This method writes the DataFrame to JSON files, potentially splitting it
-         into chunks based on chunk_size and buffer_size settings.
-
-         Args:
-             dataframe (daft.DataFrame): The DataFrame to write.
-
-         Note:
-             If the DataFrame is empty, the method returns without writing.
-         """
-         try:
-             if inspect.isasyncgen(batched_dataframe):
-                 async for dataframe in batched_dataframe:
-                     if not is_empty_dataframe(dataframe):
-                         await self.write_daft_dataframe(dataframe)
-             else:
-                 # Cast to Generator since we've confirmed it's not an AsyncGenerator
-                 sync_generator = cast(
-                     Generator["daft.DataFrame", None, None], batched_dataframe
-                 )  # noqa: F821
-                 for dataframe in sync_generator:
-                     if not is_empty_dataframe(dataframe):
-                         await self.write_daft_dataframe(dataframe)
-         except Exception as e:
-             logger.error(f"Error writing batched daft dataframe: {str(e)}")
-
-     @abstractmethod
-     async def write_daft_dataframe(self, dataframe: "daft.DataFrame"):  # noqa: F821
-         """Write a daft DataFrame to the output destination.
-
-         Args:
-             dataframe (daft.DataFrame): The DataFrame to write.
-         """
-         pass
-
-     async def get_statistics(
-         self, typename: Optional[str] = None
-     ) -> ActivityStatistics:
-         """Returns statistics about the output.
-
-         This method returns a ActivityStatistics object with total record count and chunk count.
-
-         Args:
-             typename (str): Type name of the entity e.g database, schema, table.
-
-         Raises:
-             ValidationError: If the statistics data is invalid
-             Exception: If there's an error writing the statistics
-         """
-         try:
-             statistics = await self.write_statistics(typename)
-             if not statistics:
-                 raise ValueError("No statistics data available")
-             statistics = ActivityStatistics.model_validate(statistics)
-             if typename:
-                 statistics.typename = typename
-             return statistics
-         except Exception as e:
-             logger.error(f"Error getting statistics: {str(e)}")
-             raise
-
-     async def _upload_file(self, file_name: str):
-         """Upload a file to the object store."""
-         await ObjectStore.upload_file(
-             source=file_name,
-             destination=get_object_store_prefix(file_name),
-         )
-
-         self.current_buffer_size_bytes = 0
-
-     async def _flush_buffer(self, chunk: "pd.DataFrame", chunk_part: int):
-         """Flush the current buffer to a JSON file.
-
-         This method combines all DataFrames in the buffer, writes them to a JSON file,
-         and uploads the file to the object store.
-
-         Note:
-             If the buffer is empty or has no records, the method returns without writing.
-         """
-         try:
-             if not is_empty_dataframe(chunk):
-                 self.total_record_count += len(chunk)
-                 output_file_name = (
-                     f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
-                 )
-                 await self.write_chunk(chunk, output_file_name)
-
-             self.current_buffer_size = 0
-
-             # Record chunk metrics
-             self.metrics.record_metric(
-                 name="chunks_written",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "output"},
-                 description="Number of chunks written to files",
-             )
-
-         except Exception as e:
-             # Record metrics for failed write
-             self.metrics.record_metric(
-                 name="write_errors",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "output", "error": str(e)},
-                 description="Number of errors while writing to files",
-             )
-             logger.error(f"Error flushing buffer to files: {str(e)}")
-             raise e
-
-     async def write_statistics(
-         self, typename: Optional[str] = None
-     ) -> Optional[Dict[str, Any]]:
-         """Write statistics about the output to a JSON file.
-
-         This method writes statistics including total record count and chunk count
-         to a JSON file and uploads it to the object store.
-
-         Raises:
-             Exception: If there's an error writing or uploading the statistics.
-         """
-         try:
-             # prepare the statistics
-             statistics = {
-                 "total_record_count": self.total_record_count,
-                 "chunk_count": len(self.partitions),
-                 "partitions": self.partitions,
-             }
-
-             # Ensure typename is included in the statistics payload (if provided)
-             if typename:
-                 statistics["typename"] = typename
-
-             # Write the statistics to a json file inside a dedicated statistics/ folder
-             statistics_dir = os.path.join(self.output_path, "statistics")
-             os.makedirs(statistics_dir, exist_ok=True)
-             output_file_name = os.path.join(statistics_dir, "statistics.json.ignore")
-             # If chunk_start is provided, include it in the statistics filename
-             try:
-                 cs = getattr(self, "chunk_start", None)
-                 if cs is not None:
-                     output_file_name = os.path.join(
-                         statistics_dir, f"statistics-chunk-{cs}.json.ignore"
-                     )
-             except Exception:
-                 # If accessing chunk_start fails, fallback to default filename
-                 pass
-
-             # Write the statistics dictionary to the JSON file
-             with open(output_file_name, "wb") as f:
-                 f.write(orjson.dumps(statistics))
-
-             destination_file_path = get_object_store_prefix(output_file_name)
-             # Push the file to the object store
-             await ObjectStore.upload_file(
-                 source=output_file_name,
-                 destination=destination_file_path,
-             )
-
-             return statistics
-         except Exception as e:
-             logger.error(f"Error writing statistics: {str(e)}")
application_sdk/outputs/iceberg.py
@@ -1,139 +0,0 @@
- from typing import TYPE_CHECKING, Union
-
- from pyiceberg.catalog import Catalog
- from pyiceberg.table import Table
- from temporalio import activity
-
- from application_sdk.observability.logger_adaptor import get_logger
- from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
- from application_sdk.outputs import Output
-
- logger = get_logger(__name__)
- activity.logger = logger
-
- if TYPE_CHECKING:
-     import daft
-     import pandas as pd
-
-
- class IcebergOutput(Output):
-     """
-     Iceberg Output class to write data to Iceberg tables using daft and pandas
-     """
-
-     def __init__(
-         self,
-         iceberg_catalog: Catalog,
-         iceberg_namespace: str,
-         iceberg_table: Union[str, Table],
-         mode: str = "append",
-         total_record_count: int = 0,
-         chunk_count: int = 0,
-         retain_local_copy: bool = False,
-     ):
-         """Initialize the Iceberg output class.
-
-         Args:
-             iceberg_catalog (Catalog): Iceberg catalog object.
-             iceberg_namespace (str): Iceberg namespace.
-             iceberg_table (Union[str, Table]): Iceberg table object or table name.
-             mode (str, optional): Write mode for the iceberg table. Defaults to "append".
-             total_record_count (int, optional): Total record count written to the iceberg table. Defaults to 0.
-             chunk_count (int, optional): Number of chunks written to the iceberg table. Defaults to 0.
-             retain_local_copy (bool, optional): Whether to retain the local copy of the files.
-                 Defaults to False.
-         """
-         self.total_record_count = total_record_count
-         self.chunk_count = chunk_count
-         self.iceberg_catalog = iceberg_catalog
-         self.iceberg_namespace = iceberg_namespace
-         self.iceberg_table = iceberg_table
-         self.mode = mode
-         self.metrics = get_metrics()
-         self.retain_local_copy = retain_local_copy
-
-     async def write_dataframe(self, dataframe: "pd.DataFrame"):
-         """
-         Method to write the pandas dataframe to an iceberg table
-         """
-         try:
-             import daft
-
-             if len(dataframe) == 0:
-                 return
-             # convert the pandas dataframe to a daft dataframe
-             daft_dataframe = daft.from_pandas(dataframe)
-             await self.write_daft_dataframe(daft_dataframe)
-
-             # Record metrics for successful write
-             self.metrics.record_metric(
-                 name="iceberg_write_records",
-                 value=len(dataframe),
-                 metric_type=MetricType.COUNTER,
-                 labels={"mode": self.mode, "type": "pandas"},
-                 description="Number of records written to Iceberg table from pandas DataFrame",
-             )
-         except Exception as e:
-             # Record metrics for failed write
-             self.metrics.record_metric(
-                 name="iceberg_write_errors",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"mode": self.mode, "type": "pandas", "error": str(e)},
-                 description="Number of errors while writing to Iceberg table",
-             )
-             logger.error(f"Error writing pandas dataframe to iceberg table: {str(e)}")
-             raise e
-
-     async def write_daft_dataframe(self, dataframe: "daft.DataFrame"):  # noqa: F821
-         """
-         Method to write the daft dataframe to an iceberg table
-         """
-         try:
-             if dataframe.count_rows() == 0:
-                 return
-             # Create a new table in the iceberg catalog
-             self.chunk_count += 1
-             self.total_record_count += dataframe.count_rows()
-
-             # check if iceberg table is already created
-             if isinstance(self.iceberg_table, Table):
-                 # if yes, use the existing iceberg table
-                 table = self.iceberg_table
-             else:
-                 # if not, create a new table in the iceberg catalog
-                 table = self.iceberg_catalog.create_table_if_not_exists(
-                     f"{self.iceberg_namespace}.{self.iceberg_table}",
-                     schema=dataframe.to_arrow().schema,
-                 )
-             # write the dataframe to the iceberg table
-             dataframe.write_iceberg(table, mode=self.mode)
-
-             # Record metrics for successful write
-             self.metrics.record_metric(
-                 name="iceberg_write_records",
-                 value=dataframe.count_rows(),
-                 metric_type=MetricType.COUNTER,
-                 labels={"mode": self.mode, "type": "daft"},
-                 description="Number of records written to Iceberg table from daft DataFrame",
-             )
-
-             # Record chunk metrics
-             self.metrics.record_metric(
-                 name="iceberg_chunks_written",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"mode": self.mode},
-                 description="Number of chunks written to Iceberg table",
-             )
-         except Exception as e:
-             # Record metrics for failed write
-             self.metrics.record_metric(
-                 name="iceberg_write_errors",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"mode": self.mode, "type": "daft", "error": str(e)},
-                 description="Number of errors while writing to Iceberg table",
-             )
-             logger.error(f"Error writing daft dataframe to iceberg table: {str(e)}")
-             raise e
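
For context only (not part of the diff): IcebergOutput, removed above, was constructed with a pyiceberg catalog and wrote pandas or daft DataFrames straight into an Iceberg table. A minimal usage sketch follows; the catalog name and URI, the "raw" namespace, and the "assets" table name are placeholders rather than SDK defaults, and the import path is the 1.x one deleted by this release.

# Hypothetical usage sketch of the removed 1.x IcebergOutput; connection
# values below are placeholders, not defaults shipped with the SDK.
from pyiceberg.catalog import load_catalog

from application_sdk.outputs.iceberg import IcebergOutput  # module removed in 2.0.0

catalog = load_catalog("default", uri="http://localhost:8181")  # placeholder REST catalog

output = IcebergOutput(
    iceberg_catalog=catalog,
    iceberg_namespace="raw",  # placeholder namespace
    iceberg_table="assets",   # created on first write if it does not already exist
    mode="append",
)


async def persist(dataframe) -> None:
    # Converts the pandas DataFrame to daft, appends it to raw.assets, and
    # records the iceberg_write_records / iceberg_chunks_written metrics.
    await output.write_dataframe(dataframe)

When a string table name is passed, write_daft_dataframe() calls create_table_if_not_exists() on the catalog before writing, as shown in the hunk above.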