atlan-application-sdk 0.1.1rc43__py3-none-any.whl → 0.1.1rc44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,15 @@
  import os
  from datetime import datetime
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

  import orjson
  from temporalio import activity

- from application_sdk.activities.common.utils import get_object_store_prefix
+ from application_sdk.activities.common.models import ActivityStatistics
  from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
  from application_sdk.observability.logger_adaptor import get_logger
  from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
  from application_sdk.outputs import Output
- from application_sdk.services.objectstore import ObjectStore

  logger = get_logger(__name__)
  activity.logger = logger
@@ -20,22 +19,6 @@ if TYPE_CHECKING:
      import pandas as pd


- def path_gen(chunk_start: int | None, chunk_count: int) -> str:
-     """Generate a file path for a chunk.
-
-     Args:
-         chunk_start (int | None): Starting index of the chunk, or None for single chunk.
-         chunk_count (int): Total number of chunks.
-
-     Returns:
-         str: Generated file path for the chunk.
-     """
-     if chunk_start is None:
-         return f"{str(chunk_count)}.json"
-     else:
-         return f"chunk-{chunk_start}-part{chunk_count}.json"
-
-
  def convert_datetime_to_epoch(data: Any) -> Any:
      """Convert datetime objects to epoch timestamps in milliseconds.

@@ -68,7 +51,6 @@ class JsonOutput(Output):
      Attributes:
          output_path (Optional[str]): Base path where JSON files will be written.
          output_suffix (str): Suffix added to file paths when uploading to object store.
-         output_prefix (Optional[str]): Prefix for output files and object store paths.
          typename (Optional[str]): Type identifier for the data being written.
          chunk_start (Optional[int]): Starting index for chunk numbering.
          buffer_size (int): Size of the write buffer in bytes.
@@ -79,18 +61,18 @@ class JsonOutput(Output):
              data before writing.
      """

+     _EXTENSION = ".json"
+
      def __init__(
          self,
          output_suffix: str,
          output_path: Optional[str] = None,
-         output_prefix: Optional[str] = None,
          typename: Optional[str] = None,
          chunk_start: Optional[int] = None,
-         buffer_size: int = 100000,
-         chunk_size: Optional[int] = None,
+         buffer_size: int = 5000,
+         chunk_size: Optional[int] = 50000,  # to limit the memory usage on upload
          total_record_count: int = 0,
          chunk_count: int = 0,
-         path_gen: Callable[[int | None, int], str] = path_gen,
          start_marker: Optional[str] = None,
          end_marker: Optional[str] = None,
          retain_local_copy: bool = False,
@@ -101,7 +83,6 @@ class JsonOutput(Output):
          Args:
              output_path (str): Path where JSON files will be written.
              output_suffix (str): Prefix for files when uploading to object store.
-             output_prefix (Optional[str], optional): Prefix for files where the files will be written and uploaded.
              chunk_start (Optional[int], optional): Starting index for chunk numbering.
                  Defaults to None.
              buffer_size (int, optional): Size of the buffer in bytes.
@@ -112,29 +93,27 @@ class JsonOutput(Output):
                  Defaults to 0.
              chunk_count (int, optional): Initial chunk count.
                  Defaults to 0.
-             path_gen (Callable, optional): Function to generate file paths.
-                 Defaults to path_gen function.
              retain_local_copy (bool, optional): Whether to retain the local copy of the files.
                  Defaults to False.
          """
          self.output_path = output_path
          self.output_suffix = output_suffix
-         self.output_prefix = output_prefix
          self.typename = typename
          self.chunk_start = chunk_start
          self.total_record_count = total_record_count
          self.chunk_count = chunk_count
          self.buffer_size = buffer_size
-         self.chunk_size = chunk_size or 100000
+         self.chunk_size = chunk_size or 50000  # to limit the memory usage on upload
          self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
          self.current_buffer_size = 0
          self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
          self.max_file_size_bytes = int(
              DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
          )  # 90% of DAPR limit as safety buffer
-         self.path_gen = path_gen
          self.start_marker = start_marker
          self.end_marker = end_marker
+         self.partitions = []
+         self.chunk_part = 0
          self.metrics = get_metrics()
          self.retain_local_copy = retain_local_copy

@@ -146,81 +125,8 @@ class JsonOutput(Output):
              self.output_path = os.path.join(self.output_path, typename)
          os.makedirs(self.output_path, exist_ok=True)

-         # For Query Extraction
-         if self.start_marker and self.end_marker:
-             self.path_gen = (
-                 lambda chunk_start,
-                 chunk_count: f"{self.start_marker}_{self.end_marker}.json"
-             )
-
-     async def write_dataframe(self, dataframe: "pd.DataFrame"):
-         """Write a pandas DataFrame to JSON files.
-
-         This method writes the DataFrame to JSON files, potentially splitting it
-         into chunks based on chunk_size and buffer_size settings.
-
-         Args:
-             dataframe (pd.DataFrame): The DataFrame to write.
-
-         Note:
-             If the DataFrame is empty, the method returns without writing.
-         """
-         if len(dataframe) == 0:
-             return
-
-         try:
-             # Split the DataFrame into chunks
-             partition = (
-                 self.chunk_size
-                 if self.chunk_start is None
-                 else min(self.chunk_size, self.buffer_size)
-             )
-             chunks = [
-                 dataframe[i : i + partition]
-                 for i in range(0, len(dataframe), partition)
-             ]
-
-             for chunk in chunks:
-                 # Estimate size of this chunk
-                 chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "json")
-
-                 # Check if adding this chunk would exceed size limit
-                 if (
-                     self.current_buffer_size_bytes + chunk_size_bytes
-                     > self.max_file_size_bytes
-                     and self.current_buffer_size > 0
-                 ):
-                     # Flush current buffer before adding this chunk
-                     await self._flush_buffer()
-
-                 self.buffer.append(chunk)
-                 self.current_buffer_size += len(chunk)
-                 self.current_buffer_size_bytes += chunk_size_bytes
-
-                 if self.current_buffer_size >= partition:
-                     await self._flush_buffer()
-
-             await self._flush_buffer()
-
-             # Record metrics for successful write
-             self.metrics.record_metric(
-                 name="json_write_records",
-                 value=len(dataframe),
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "pandas"},
-                 description="Number of records written to JSON files from pandas DataFrame",
-             )
-
-         except Exception as e:
-             # Record metrics for failed write
-             self.metrics.record_metric(
-                 name="json_write_errors",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "pandas", "error": str(e)},
-                 description="Number of errors while writing to JSON files",
-             )
-             logger.error(f"Error writing dataframe to json: {str(e)}")
+         if self.chunk_start:
+             self.chunk_count = self.chunk_start + self.chunk_count

      async def write_daft_dataframe(
          self,
@@ -249,6 +155,9 @@ class JsonOutput(Output):
          Daft does not have built-in JSON writing support, so we are using orjson.
          """
          try:
+             if self.chunk_start is None:
+                 self.chunk_part = 0
+
              buffer = []
              for row in dataframe.iter_rows():
                  self.total_record_count += 1
@@ -261,17 +170,27 @@ class JsonOutput(Output):
                  # Serialize the row and add it to the buffer
                  serialized_row = orjson.dumps(
                      cleaned_row, option=orjson.OPT_APPEND_NEWLINE
-                 ).decode("utf-8")
+                 )
                  buffer.append(serialized_row)
+                 self.current_buffer_size += 1
                  self.current_buffer_size_bytes += len(serialized_row)
-                 if (self.chunk_size and len(buffer) >= self.chunk_size) or (
-                     self.current_buffer_size_bytes > self.max_file_size_bytes
+
+                 # If the buffer size is reached append to the file and clear the buffer
+                 if self.current_buffer_size >= self.buffer_size:
+                     await self.flush_daft_buffer(buffer, self.chunk_part)
+
+                 if self.current_buffer_size_bytes > self.max_file_size_bytes or (
+                     self.total_record_count > 0
+                     and self.total_record_count % self.chunk_size == 0
                  ):
-                     await self.flush_daft_buffer(buffer)
+                     output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part, self.start_marker, self.end_marker)}"
+                     if os.path.exists(output_file_name):
+                         await self._upload_file(output_file_name)
+                     self.chunk_part += 1

              # Write any remaining rows in the buffer
-             if buffer:
-                 await self.flush_daft_buffer(buffer)
+             if self.current_buffer_size > 0:
+                 await self.flush_daft_buffer(buffer, self.chunk_part)

              # Record metrics for successful write
              self.metrics.record_metric(
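The hunk above replaces the old rule (flush whenever the buffer reaches chunk_size or the byte cap) with two separate checks: the row buffer is appended to the current part file every buffer_size rows, and a new chunk part is started, after uploading the finished part, when the accumulated bytes approach the Dapr gRPC payload cap or a chunk_size record boundary is crossed. A minimal, standalone sketch of those two thresholds, with placeholder constants rather than the SDK's real values:

    # Illustration of the two thresholds introduced above; not SDK code.
    BUFFER_SIZE = 5_000        # rows appended to the open part file at a time
    CHUNK_SIZE = 50_000        # records per chunk part, to limit memory on upload
    MAX_FILE_SIZE_BYTES = int(4 * 1024 * 1024 * 0.9)  # placeholder: 90% of an assumed gRPC limit

    def should_flush_rows(buffered_rows: int) -> bool:
        # Append the buffered rows to the current part file once the row buffer fills.
        return buffered_rows >= BUFFER_SIZE

    def should_start_new_part(part_bytes: int, total_records: int) -> bool:
        # Rotate to a new chunk part near the payload cap or on a chunk_size boundary.
        return part_bytes > MAX_FILE_SIZE_BYTES or (
            total_records > 0 and total_records % CHUNK_SIZE == 0
        )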
@@ -281,14 +200,6 @@ class JsonOutput(Output):
                  labels={"type": "daft"},
                  description="Number of records written to JSON files from daft DataFrame",
              )
-
-             # Push files to the object store
-             await ObjectStore.upload_prefix(
-                 source=self.output_path,
-                 destination=get_object_store_prefix(self.output_path),
-                 retain_local_copy=self.retain_local_copy,
-             )
-
          except Exception as e:
              # Record metrics for failed write
              self.metrics.record_metric(
@@ -300,22 +211,20 @@ class JsonOutput(Output):
              )
              logger.error(f"Error writing daft dataframe to json: {str(e)}")

-     async def flush_daft_buffer(self, buffer: List[str]):
+     async def flush_daft_buffer(self, buffer: List[str], chunk_part: int):
          """Flush the current buffer to a JSON file.

          This method combines all DataFrames in the buffer, writes them to a JSON file,
          and uploads the file to the object store.
          """
-         self.chunk_count += 1
          output_file_name = (
-             f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
+             f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
          )
-         with open(output_file_name, "w") as f:
+         with open(output_file_name, "ab+") as f:
              f.writelines(buffer)
              buffer.clear()  # Clear the buffer

          self.current_buffer_size = 0
-         self.current_buffer_size_bytes = 0

          # Record chunk metrics
          self.metrics.record_metric(
@@ -326,67 +235,34 @@ class JsonOutput(Output):
              description="Number of chunks written to JSON files",
          )

-     async def _flush_buffer(self):
-         """Flush the current buffer to a JSON file.
-
-         This method combines all DataFrames in the buffer, writes them to a JSON file,
-         and uploads the file to the object store.
+     async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+         """Write a chunk to a JSON file.

-         Note:
-             If the buffer is empty or has no records, the method returns without writing.
+         This method writes a chunk to a JSON file and uploads the file to the object store.
          """
-         import pandas as pd
+         mode = "w" if not os.path.exists(file_name) else "a"
+         chunk.to_json(file_name, orient="records", lines=True, mode=mode)

-         if not self.buffer or not self.current_buffer_size:
-             return
+     async def get_statistics(
+         self, typename: Optional[str] = None
+     ) -> ActivityStatistics:
+         """Get the statistics of the JSON files.

-         if not all(isinstance(df, pd.DataFrame) for df in self.buffer):
-             raise TypeError(
-                 "_flush_buffer encountered non-DataFrame elements in buffer. This should not happen."
+         This method returns the statistics of the JSON files.
+         """
+         # Finally upload the final file
+         if self.current_buffer_size_bytes > 0:
+             output_file_name = (
+                 f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
              )
+             if os.path.exists(output_file_name):
+                 await self._upload_file(output_file_name)
+             self.chunk_part += 1

-         try:
-             # Now it's safe to cast for pd.concat
-             pd_buffer: List[pd.DataFrame] = self.buffer  # type: ignore
-             combined_dataframe = pd.concat(pd_buffer)
-
-             # Write DataFrame to JSON file
-             if not combined_dataframe.empty:
-                 self.chunk_count += 1
-                 self.total_record_count += len(combined_dataframe)
-                 output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
-                 combined_dataframe.to_json(
-                     output_file_name, orient="records", lines=True
-                 )
-
-                 # Record chunk metrics
-                 self.metrics.record_metric(
-                     name="json_chunks_written",
-                     value=1,
-                     metric_type=MetricType.COUNTER,
-                     labels={"type": "pandas"},
-                     description="Number of chunks written to JSON files",
-                 )
-
-                 # Push the file to the object store
-                 await ObjectStore.upload_file(
-                     source=output_file_name,
-                     destination=get_object_store_prefix(output_file_name),
-                     retain_local_copy=self.retain_local_copy,
-                 )
-
-                 self.buffer.clear()
-                 self.current_buffer_size = 0
-                 self.current_buffer_size_bytes = 0
+         # If chunk_start is set we don't want to increment the chunk_count
+         # Since it should only increment the chunk_part in this case
+         if self.chunk_start is None:
+             self.chunk_count += 1
+         self.partitions.append(self.chunk_part)

-         except Exception as e:
-             # Record metrics for failed write
-             self.metrics.record_metric(
-                 name="json_write_errors",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "pandas", "error": str(e)},
-                 description="Number of errors while writing to JSON files",
-             )
-             logger.error(f"Error flushing buffer to json: {str(e)}")
-             raise e
+         return await super().get_statistics(typename)
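Taken together, the new flow writes newline-delimited JSON incrementally: rows are appended to per-part files by flush_daft_buffer (now opened in append mode with an explicit chunk_part), finished parts are uploaded as soon as they hit a size or record boundary, and get_statistics uploads whatever remains before delegating to the base Output class. A hedged usage sketch, assuming the module path application_sdk.outputs.json and that the dataframe is the only required argument to write_daft_dataframe:

    # Illustrative only; argument names beyond those visible in this diff are assumptions.
    import daft
    from application_sdk.outputs.json import JsonOutput

    async def export_tables(df: daft.DataFrame) -> None:
        output = JsonOutput(
            output_suffix="raw/tables",    # illustrative object-store suffix
            output_path="./local-output",  # illustrative local path
            typename="TABLE",
            buffer_size=5000,              # new default in this release
            chunk_size=50000,              # new default, limits memory on upload
        )
        await output.write_daft_dataframe(df)
        stats = await output.get_statistics()  # uploads the final part, returns ActivityStatistics
        print(stats)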