moose-lib 0.6.90-py3-none-any.whl → 0.6.283-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. moose_lib/__init__.py +38 -3
  2. moose_lib/blocks.py +497 -37
  3. moose_lib/clients/redis_client.py +26 -14
  4. moose_lib/commons.py +94 -5
  5. moose_lib/config/config_file.py +44 -2
  6. moose_lib/config/runtime.py +137 -5
  7. moose_lib/data_models.py +451 -46
  8. moose_lib/dmv2/__init__.py +88 -60
  9. moose_lib/dmv2/_registry.py +3 -1
  10. moose_lib/dmv2/_source_capture.py +37 -0
  11. moose_lib/dmv2/consumption.py +55 -32
  12. moose_lib/dmv2/ingest_api.py +9 -2
  13. moose_lib/dmv2/ingest_pipeline.py +56 -13
  14. moose_lib/dmv2/life_cycle.py +3 -1
  15. moose_lib/dmv2/materialized_view.py +24 -14
  16. moose_lib/dmv2/moose_model.py +165 -0
  17. moose_lib/dmv2/olap_table.py +304 -119
  18. moose_lib/dmv2/registry.py +28 -3
  19. moose_lib/dmv2/sql_resource.py +16 -8
  20. moose_lib/dmv2/stream.py +241 -21
  21. moose_lib/dmv2/types.py +14 -8
  22. moose_lib/dmv2/view.py +13 -6
  23. moose_lib/dmv2/web_app.py +175 -0
  24. moose_lib/dmv2/web_app_helpers.py +96 -0
  25. moose_lib/dmv2/workflow.py +37 -9
  26. moose_lib/internal.py +537 -68
  27. moose_lib/main.py +87 -56
  28. moose_lib/query_builder.py +18 -5
  29. moose_lib/query_param.py +54 -20
  30. moose_lib/secrets.py +122 -0
  31. moose_lib/streaming/streaming_function_runner.py +266 -156
  32. moose_lib/utilities/sql.py +0 -1
  33. {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/METADATA +19 -1
  34. moose_lib-0.6.283.dist-info/RECORD +63 -0
  35. tests/__init__.py +1 -1
  36. tests/conftest.py +38 -1
  37. tests/test_backward_compatibility.py +85 -0
  38. tests/test_cluster_validation.py +85 -0
  39. tests/test_codec.py +75 -0
  40. tests/test_column_formatting.py +80 -0
  41. tests/test_fixedstring.py +43 -0
  42. tests/test_iceberg_config.py +105 -0
  43. tests/test_int_types.py +211 -0
  44. tests/test_kafka_config.py +141 -0
  45. tests/test_materialized.py +74 -0
  46. tests/test_metadata.py +37 -0
  47. tests/test_moose.py +21 -30
  48. tests/test_moose_model.py +153 -0
  49. tests/test_olap_table_moosemodel.py +89 -0
  50. tests/test_olap_table_versioning.py +210 -0
  51. tests/test_query_builder.py +97 -9
  52. tests/test_redis_client.py +10 -3
  53. tests/test_s3queue_config.py +211 -110
  54. tests/test_secrets.py +239 -0
  55. tests/test_simple_aggregate.py +114 -0
  56. tests/test_web_app.py +227 -0
  57. moose_lib-0.6.90.dist-info/RECORD +0 -42
  58. {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/WHEEL +0 -0
  59. {moose_lib-0.6.90.dist-info → moose_lib-0.6.283.dist-info}/top_level.txt +0 -0
@@ -30,13 +30,16 @@ from typing import Optional, Callable, Tuple, Any
 
 from moose_lib.dmv2 import get_streams, DeadLetterModel
 from moose_lib import cli_log, CliLogData, DeadLetterQueue
-from moose_lib.commons import EnhancedJSONEncoder, moose_management_port
+from moose_lib.commons import (
+    EnhancedJSONEncoder,
+    moose_management_port,
+    get_kafka_consumer,
+    get_kafka_producer,
+)
 
 # Force stdout to be unbuffered
 sys.stdout = io.TextIOWrapper(
-    open(sys.stdout.fileno(), 'wb', 0),
-    write_through=True,
-    line_buffering=True
+    open(sys.stdout.fileno(), "wb", 0), write_through=True, line_buffering=True
 )
 
 
@@ -44,7 +47,7 @@ sys.stdout = io.TextIOWrapper(
 class KafkaTopicConfig:
     """
     Configuration for a Kafka topic including namespace support.
-
+
     Attributes:
         streaming_engine_type: The type of topic (source or target)
         name: Full topic name including namespace if present
@@ -54,6 +57,7 @@ class KafkaTopicConfig:
         namespace: Optional namespace prefix for the topic
         version: Optional version string for the topic
     """
+
     streaming_engine_type: str
     name: str
     partitions: int
@@ -71,31 +75,37 @@ class KafkaTopicConfig:
         if name.endswith(version_suffix):
             name = name.removesuffix(version_suffix)
         else:
-            raise Exception(f"Version suffix {version_suffix} not found in topic name {name}")
+            raise Exception(
+                f"Version suffix {version_suffix} not found in topic name {name}"
+            )
 
         if self.namespace is not None and self.namespace != "":
             prefix = self.namespace + "."
             if name.startswith(prefix):
                 name = name.removeprefix(prefix)
             else:
-                raise Exception(f"Namespace prefix {prefix} not found in topic name {name}")
+                raise Exception(
+                    f"Namespace prefix {prefix} not found in topic name {name}"
+                )
 
         return name
 
 
-def load_streaming_function_dmv1(function_file_dir: str, function_file_name: str) -> Tuple[type, Callable]:
+def load_streaming_function_dmv1(
+    function_file_dir: str, function_file_name: str
+) -> Tuple[type, Callable]:
     """
     Load a DMV1 (legacy) streaming function from a Python module.
-
+
     Args:
         function_file_dir: Directory containing the streaming function module
         function_file_name: Name of the module file without .py extension
-
+
     Returns:
         Tuple of (input_type, run_function) where:
         - input_type is the type annotation of the run function's input parameter
         - run_function is the actual transformation function
-
+
     Raises:
         SystemExit: If module import fails or if multiple/no streaming functions found
     """
@@ -110,13 +120,19 @@ def load_streaming_function_dmv1(function_file_dir: str, function_file_name: str
         sys.exit(1)
 
     # Get all the named flows in the flow file and make sure the flow is of type StreamingFunction
-    streaming_functions = [f for f in dir(module) if isinstance(getattr(module, f), streaming_function_def)]
+    streaming_functions = [
+        f for f in dir(module) if isinstance(getattr(module, f), streaming_function_def)
+    ]
 
     # Make sure that there is only one flow in the file
     if len(streaming_functions) != 1:
-        cli_log(CliLogData(action="Function",
-                           message=f"Expected one streaming function in the file, but got {len(streaming_functions)}",
-                           message_type="Error"))
+        cli_log(
+            CliLogData(
+                action="Function",
+                message=f"Expected one streaming function in the file, but got {len(streaming_functions)}",
+                message_type="Error",
+            )
+        )
         sys.exit(1)
 
     # get the flow definition
@@ -126,26 +142,29 @@ def load_streaming_function_dmv1(function_file_dir: str, function_file_name: str
     streaming_function_run = streaming_function_def.run
 
     # get run input type that doesn't rely on the name of the input parameter
-    run_input_type = streaming_function_run.__annotations__[list(streaming_function_run.__annotations__.keys())[0]]
+    run_input_type = streaming_function_run.__annotations__[
+        list(streaming_function_run.__annotations__.keys())[0]
+    ]
 
     return run_input_type, streaming_function_run
 
 
-def load_streaming_function_dmv2(function_file_dir: str, function_file_name: str) -> tuple[
-    type, list[tuple[Callable, Optional[DeadLetterQueue]]]]:
+def load_streaming_function_dmv2(
+    function_file_dir: str, function_file_name: str
+) -> tuple[type, list[tuple[Callable, Optional[DeadLetterQueue]]]]:
     """
     Load a DMV2 streaming function by finding the stream transformation that matches
     the source and target topics.
-
+
     Args:
         function_file_dir: Directory containing the main.py file
         function_file_name: Name of the main.py file (without extension)
-
+
     Returns:
         Tuple of (input_type, transformation_functions) where:
         - input_type is the Pydantic model type of the source stream
         - transformation_functions is a list of functions that transform source to target data and their dead letter queues
-
+
     Raises:
         SystemExit: If module import fails or if no matching transformation is found
     """
@@ -164,7 +183,10 @@ def load_streaming_function_dmv2(function_file_dir: str, function_file_name: str
             continue
 
         if stream.has_consumers() and target_topic is None:
-            consumers = [(entry.consumer, entry.config.dead_letter_queue) for entry in stream.consumers]
+            consumers = [
+                (entry.consumer, entry.config.dead_letter_queue)
+                for entry in stream.consumers
+            ]
             if not consumers:
                 continue
             return stream.model_type, consumers
@@ -173,52 +195,94 @@ def load_streaming_function_dmv2(function_file_dir: str, function_file_name: str
         for dest_stream_py_name, transform_entries in stream.transformations.items():
             # The source topic name should match the stream name
             # The destination topic name should match the destination stream name
-            if source_py_stream_name == source_topic.topic_name_to_stream_name() and dest_stream_py_name == target_topic.topic_name_to_stream_name():
+            if (
+                source_py_stream_name == source_topic.topic_name_to_stream_name()
+                and dest_stream_py_name == target_topic.topic_name_to_stream_name()
+            ):
                 # Found the matching transformation
-                transformations = [(entry.transformation, entry.config.dead_letter_queue) for entry in
-                                   transform_entries]
+                transformations = [
+                    (entry.transformation, entry.config.dead_letter_queue)
+                    for entry in transform_entries
+                ]
                 if not transformations:
                     continue
                 return stream.model_type, transformations
 
     # If we get here, no matching transformation was found
-    cli_log(CliLogData(
-        action="Function",
-        message=f"No transformation found from {source_topic.name} to {target_topic.name}",
-        message_type="Error"
-    ))
+    cli_log(
+        CliLogData(
+            action="Function",
+            message=f"No transformation found from {source_topic.name} to {target_topic.name}",
+            message_type="Error",
+        )
+    )
     sys.exit(1)
 
 
-parser = argparse.ArgumentParser(description='Run a streaming function')
+parser = argparse.ArgumentParser(description="Run a streaming function")
 
-parser.add_argument('source_topic_json', type=str, help='The source topic for the streaming function')
+parser.add_argument(
+    "source_topic_json", type=str, help="The source topic for the streaming function"
+)
 # In DMV2 is the dir is the dir of the main.py or index.ts file
 # and the function_file_name is the file name of main.py or index.ts
 # In DMV1 the dir is the dir of the streaming function file
 # and the function_file_name is the file name of the streaming function without the .py extension
-parser.add_argument('function_file_dir', type=str, help='The dir of the streaming function file')
-parser.add_argument('function_file_name', type=str,
-                    help='The file name of the streaming function without the .py extension')
-parser.add_argument('broker', type=str, help='The broker to use for the streaming function')
-parser.add_argument('--target_topic_json', type=str, help='The target topic for the streaming function')
-parser.add_argument('--sasl_username', type=str, help='The SASL username to use for the streaming function')
-parser.add_argument('--sasl_password', type=str, help='The SASL password to use for the streaming function')
-parser.add_argument('--sasl_mechanism', type=str, help='The SASL mechanism to use for the streaming function')
-parser.add_argument('--security_protocol', type=str, help='The security protocol to use for the streaming function')
-parser.add_argument('--dmv2', action=argparse.BooleanOptionalAction, type=bool,
-                    help='Whether to use the DMV2 format for the streaming function')
+parser.add_argument(
+    "function_file_dir", type=str, help="The dir of the streaming function file"
+)
+parser.add_argument(
+    "function_file_name",
+    type=str,
+    help="The file name of the streaming function without the .py extension",
+)
+parser.add_argument(
+    "broker", type=str, help="The broker to use for the streaming function"
+)
+parser.add_argument(
+    "--target_topic_json", type=str, help="The target topic for the streaming function"
+)
+parser.add_argument(
+    "--sasl_username",
+    type=str,
+    help="The SASL username to use for the streaming function",
+)
+parser.add_argument(
+    "--sasl_password",
+    type=str,
+    help="The SASL password to use for the streaming function",
+)
+parser.add_argument(
+    "--sasl_mechanism",
+    type=str,
+    help="The SASL mechanism to use for the streaming function",
+)
+parser.add_argument(
+    "--security_protocol",
+    type=str,
+    help="The security protocol to use for the streaming function",
+)
+parser.add_argument(
+    "--dmv2",
+    action=argparse.BooleanOptionalAction,
+    type=bool,
+    help="Whether to use the DMV2 format for the streaming function",
+)
 
 args: argparse.Namespace = parser.parse_args()
 
 for arg in vars(args):
     value = getattr(args, arg)
-    if 'password' in arg and value is not None:
-        value = '******'
+    if "password" in arg and value is not None:
+        value = "******"
     print(arg, value)
 
 source_topic = KafkaTopicConfig(**json.loads(args.source_topic_json))
-target_topic = KafkaTopicConfig(**json.loads(args.target_topic_json)) if args.target_topic_json else None
+target_topic = (
+    KafkaTopicConfig(**json.loads(args.target_topic_json))
+    if args.target_topic_json
+    else None
+)
 function_file_dir = args.function_file_dir
 function_file_name = args.function_file_name
 broker = args.broker
@@ -226,24 +290,36 @@ sasl_mechanism = args.sasl_mechanism
 
 # Setup SASL config w/ supported mechanisms
 if args.sasl_mechanism is not None:
-    if args.sasl_mechanism not in ['PLAIN', 'SCRAM-SHA-256', 'SCRAM-SHA-512']:
+    if args.sasl_mechanism not in ["PLAIN", "SCRAM-SHA-256", "SCRAM-SHA-512"]:
         raise Exception(f"Unsupported SASL mechanism: {args.sasl_mechanism}")
     if args.sasl_username is None or args.sasl_password is None:
-        raise Exception("SASL username and password must be provided if a SASL mechanism is specified")
+        raise Exception(
+            "SASL username and password must be provided if a SASL mechanism is specified"
+        )
     if args.security_protocol is None:
-        raise Exception("Security protocol must be provided if a SASL mechanism is specified")
+        raise Exception(
+            "Security protocol must be provided if a SASL mechanism is specified"
+        )
 
 sasl_config = {
-    'username': args.sasl_username,
-    'password': args.sasl_password,
-    'mechanism': args.sasl_mechanism
+    "username": args.sasl_username,
+    "password": args.sasl_password,
+    "mechanism": args.sasl_mechanism,
 }
 
 # We use flow- instead of function- because that's what the ACLs in boreal are linked with
-# When migrating - make sure the ACLs are updated to use the new prefix.
+# When migrating - make sure the ACLs are updated to use the new prefix.
 # And make sure the prefixes are the same in the ts-moose-lib and py-moose-lib
-streaming_function_id = f'flow-{source_topic.name}-{target_topic.name}' if target_topic else f'flow-{source_topic.name}'
-log_prefix = f"{source_topic.name} -> {target_topic.name}" if target_topic else f"{source_topic.name} -> None"
+streaming_function_id = (
+    f"flow-{source_topic.name}-{target_topic.name}"
+    if target_topic
+    else f"flow-{source_topic.name}"
+)
+log_prefix = (
+    f"{source_topic.name} -> {target_topic.name}"
+    if target_topic
+    else f"{source_topic.name} (consumer)"
+)
 
 
 def log(msg: str) -> None:
@@ -260,13 +336,13 @@ def error(msg: str) -> None:
 def parse_input(run_input_type: type, json_input: dict) -> Any:
     """
     Parse JSON input data into the appropriate input type for the streaming function.
-
+
     Handles Pydantic models, nested dataclass structures and lists of dataclasses.
-
+
     Args:
         run_input_type: The type to parse the JSON into
         json_input: The JSON data as a Python dict
-
+
     Returns:
         An instance of run_input_type populated with the JSON data
     """
@@ -276,7 +352,12 @@ def parse_input(run_input_type: type, json_input: dict) -> Any:
             return cls.model_validate(data)
         elif dataclasses.is_dataclass(cls):
             field_types = {f.name: f.type for f in dataclasses.fields(cls)}
-            return cls(**{name: deserialize(data.get(name), field_types[name]) for name in field_types})
+            return cls(
+                **{
+                    name: deserialize(data.get(name), field_types[name])
+                    for name in field_types
+                }
+            )
         elif isinstance(data, list):
             return [deserialize(item, cls.__args__[0]) for item in data]
         else:
@@ -288,69 +369,66 @@ def parse_input(run_input_type: type, json_input: dict) -> Any:
 def create_consumer() -> KafkaConsumer:
     """
     Create a Kafka consumer configured for the source topic.
-
+
     Handles SASL authentication if configured.
-
+    Disables auto-commit to ensure at-least-once processing semantics.
+
     Returns:
         Configured KafkaConsumer instance
     """
-    if sasl_config['mechanism'] is not None:
-        return KafkaConsumer(
-            source_topic.name,
-            client_id="python_streaming_function_consumer",
-            group_id=streaming_function_id,
-            bootstrap_servers=broker,
-            sasl_plain_username=sasl_config['username'],
-            sasl_plain_password=sasl_config['password'],
-            sasl_mechanism=sasl_config['mechanism'],
-            security_protocol=args.security_protocol,
-            # consumer_timeout_ms=10000,
-            value_deserializer=lambda m: json.loads(m.decode('utf-8'))
-        )
-    else:
-        log("No sasl mechanism specified. Using default consumer.")
-        return KafkaConsumer(
-            source_topic.name,
-            client_id="python_streaming_function_consumer",
-            group_id=streaming_function_id,
-            bootstrap_servers=broker,
-            # consumer_timeout_ms=10000,
-            value_deserializer=lambda m: json.loads(m.decode('utf-8'))
-        )
+
+    def _sr_json_deserializer(m: bytes):
+        if m is None:
+            return None
+        # Schema Registry JSON envelope: 0x00 + 4-byte schema ID (big-endian) + JSON
+        if len(m) >= 5 and m[0] == 0x00:
+            m = m[5:]
+        return json.loads(m.decode("utf-8"))
+
+    kwargs = dict(
+        broker=broker,
+        client_id="python_streaming_function_consumer",
+        group_id=streaming_function_id,
+        value_deserializer=_sr_json_deserializer,
+        sasl_username=sasl_config.get("username"),
+        sasl_password=sasl_config.get("password"),
+        sasl_mechanism=sasl_config.get("mechanism"),
+        security_protocol=args.security_protocol,
+        enable_auto_commit=False,  # Disable auto-commit for at-least-once semantics
+        auto_offset_reset="earliest",
+    )
+    consumer = get_kafka_consumer(**kwargs)
+    return consumer
 
 
 def create_producer() -> Optional[KafkaProducer]:
     """
     Create a Kafka producer configured for the target topic.
-
+
     Handles SASL authentication if configured and sets appropriate message size limits.
-
+
     Returns:
         Configured KafkaProducer instance
     """
-    max_request_size = KafkaProducer.DEFAULT_CONFIG['max_request_size'] if target_topic is None \
+    max_request_size = (
+        KafkaProducer.DEFAULT_CONFIG["max_request_size"]
+        if target_topic is None
         else target_topic.max_message_bytes
-    if sasl_config['mechanism'] is not None:
-        return KafkaProducer(
-            bootstrap_servers=broker,
-            sasl_plain_username=sasl_config['username'],
-            sasl_plain_password=sasl_config['password'],
-            sasl_mechanism=sasl_config['mechanism'],
-            security_protocol=args.security_protocol,
-            max_request_size=max_request_size
-        )
-    log("No sasl mechanism specified. Using default producer.")
-    return KafkaProducer(
-        bootstrap_servers=broker,
-        max_in_flight_requests_per_connection=1,
-        max_request_size=max_request_size
+    )
+    return get_kafka_producer(
+        broker=broker,
+        sasl_username=sasl_config.get("username"),
+        sasl_password=sasl_config.get("password"),
+        sasl_mechanism=sasl_config.get("mechanism"),
+        security_protocol=args.security_protocol,
+        max_request_size=max_request_size,
     )
 
 
 def main():
     """
     Main entry point for the streaming function runner.
-
+
     This function:
     1. Loads the appropriate streaming function (DMV1 or DMV2)
     2. Sets up metrics reporting thread and message processing thread
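Note: the rewritten `create_consumer` now delegates construction to `get_kafka_consumer` from `moose_lib/commons.py` (changed in this release but not shown in this file), and its `_sr_json_deserializer` tolerates values written in the Confluent Schema Registry JSON wire format, where the JSON document is preceded by a `0x00` magic byte and a 4-byte big-endian schema ID. A self-contained sketch of that envelope handling, with a made-up payload and schema ID:

```python
import json


def strip_schema_registry_envelope(m: bytes):
    # Mirrors the deserializer above: magic byte 0x00 + 4-byte schema ID, then plain JSON.
    if m is None:
        return None
    if len(m) >= 5 and m[0] == 0x00:
        m = m[5:]
    return json.loads(m.decode("utf-8"))


payload = json.dumps({"event": "page_view", "count": 1}).encode("utf-8")
enveloped = b"\x00" + (42).to_bytes(4, "big") + payload  # hypothetical schema ID 42

assert strip_schema_registry_envelope(enveloped) == {"event": "page_view", "count": 1}
assert strip_schema_registry_envelope(payload) == {"event": "page_view", "count": 1}
```

Because JSON text never begins with a 0x00 byte, the same deserializer handles both enveloped and plain-JSON messages.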
@@ -361,18 +439,11 @@ def main():
     # Shared state for metrics and control
     running = threading.Event()
     running.set()  # Start in running state
-    metrics = {
-        'count_in': 0,
-        'count_out': 0,
-        'bytes_count': 0
-    }
+    metrics = {"count_in": 0, "count_out": 0, "bytes_count": 0}
     metrics_lock = threading.Lock()
 
     # Shared references for cleanup
-    kafka_refs = {
-        'consumer': None,
-        'producer': None
-    }
+    kafka_refs = {"consumer": None, "producer": None}
 
     def send_message_metrics():
         while running.is_set():
@@ -381,42 +452,44 @@ def main():
                 requests.post(
                     f"http://localhost:{moose_management_port}/metrics-logs",
                     json={
-                        'timestamp': datetime.now(timezone.utc).isoformat(),
-                        'count_in': metrics['count_in'],
-                        'count_out': metrics['count_out'],
-                        'bytes': metrics['bytes_count'],
-                        'function_name': log_prefix
-                    }
+                        "timestamp": datetime.now(timezone.utc).isoformat(),
+                        "count_in": metrics["count_in"],
+                        "count_out": metrics["count_out"],
+                        "bytes": metrics["bytes_count"],
+                        "function_name": log_prefix,
+                    },
                 )
-                metrics['count_in'] = 0
-                metrics['count_out'] = 0
-                metrics['bytes_count'] = 0
+                metrics["count_in"] = 0
+                metrics["count_out"] = 0
+                metrics["bytes_count"] = 0
 
     def process_messages():
         try:
             streaming_function_input_type = None
             streaming_function_callables = None
             if args.dmv2:
-                streaming_function_input_type, streaming_function_callables = load_streaming_function_dmv2(
-                    function_file_dir, function_file_name)
+                streaming_function_input_type, streaming_function_callables = (
+                    load_streaming_function_dmv2(function_file_dir, function_file_name)
+                )
             else:
-                streaming_function_input_type, streaming_function_callable = load_streaming_function_dmv1(
-                    function_file_dir, function_file_name)
+                streaming_function_input_type, streaming_function_callable = (
+                    load_streaming_function_dmv1(function_file_dir, function_file_name)
+                )
 
                 streaming_function_callables = [(streaming_function_callable, None)]
 
             needs_producer = target_topic is not None or any(
-                pair[1] is not None for pair in streaming_function_callables)
+                pair[1] is not None for pair in streaming_function_callables
+            )
 
             # Initialize Kafka connections in the processing thread
             consumer = create_consumer()
             producer = create_producer() if needs_producer else None
 
             # Store references for cleanup
-            kafka_refs['consumer'] = consumer
-            kafka_refs['producer'] = producer
+            kafka_refs["consumer"] = consumer
+            kafka_refs["producer"] = producer
 
-            # Subscribe to topic
             consumer.subscribe([source_topic.name])
 
             log("Kafka consumer and producer initialized in processing thread")
@@ -432,17 +505,27 @@ def main():
                     # Process each partition's messages
                     for partition_messages in messages.values():
                         for message in partition_messages:
+                            log(
+                                f"Message partition={message.partition} offset={message.offset}"
+                            )
                             if not running.is_set():
                                 return
 
                             # Parse the message into the input type
-                            input_data = parse_input(streaming_function_input_type, message.value)
+                            input_data = parse_input(
+                                streaming_function_input_type, message.value
+                            )
 
                             # Run the flow
                             all_outputs = []
-                            for (streaming_function_callable, dlq) in streaming_function_callables:
+                            for (
+                                streaming_function_callable,
+                                dlq,
+                            ) in streaming_function_callables:
                                 try:
-                                    output_data = streaming_function_callable(input_data)
+                                    output_data = streaming_function_callable(
+                                        input_data
+                                    )
                                 except Exception as e:
                                     traceback.print_exc()
                                     if dlq is not None:
@@ -451,21 +534,27 @@ def main():
                                             error_message=str(e),
                                             error_type=e.__class__.__name__,
                                             failed_at=datetime.now(timezone.utc),
-                                            source="transform"
+                                            source="transform",
+                                        )
+                                        record = dead_letter.model_dump_json().encode(
+                                            "utf-8"
                                         )
-                                        record = dead_letter.model_dump_json().encode('utf-8')
                                         producer.send(dlq.name, record).get()
-                                        cli_log(CliLogData(
-                                            action="DeadLetter",
-                                            message=f"Sent message to DLQ {dlq.name}: {str(e)}",
-                                            message_type=CliLogData.ERROR
-                                        ))
+                                        cli_log(
+                                            CliLogData(
+                                                action="DeadLetter",
+                                                message=f"Sent message to DLQ {dlq.name}: {str(e)}",
+                                                message_type=CliLogData.ERROR,
+                                            )
+                                        )
                                     else:
-                                        cli_log(CliLogData(
-                                            action="Function",
-                                            message=f"Error processing message (no DLQ configured): {str(e)}",
-                                            message_type=CliLogData.ERROR
-                                        ))
+                                        cli_log(
+                                            CliLogData(
+                                                action="Function",
+                                                message=f"Error processing message (no DLQ configured): {str(e)}",
+                                                message_type=CliLogData.ERROR,
+                                            )
+                                        )
                                     # Skip to the next transformation or message
                                     continue
 
@@ -474,29 +563,50 @@ def main():
                                     continue
 
                                 # Handle streaming function returning an array or a single object
-                                output_data_list = output_data if isinstance(output_data, list) else [output_data]
+                                output_data_list = (
+                                    output_data
+                                    if isinstance(output_data, list)
+                                    else [output_data]
+                                )
                                 all_outputs.extend(output_data_list)
 
                                 with metrics_lock:
-                                    metrics['count_in'] += len(output_data_list)
+                                    metrics["count_in"] += len(output_data_list)
 
-                                cli_log(CliLogData(action="Received",
-                                                   message=f'{log_prefix} {len(output_data_list)} message(s)'))
+                                cli_log(
+                                    CliLogData(
+                                        action="Received",
+                                        message=f"{log_prefix} {len(output_data_list)} message(s)",
+                                    )
+                                )
 
                             if producer is not None:
                                 for item in all_outputs:
                                     # Ignore flow function returning null
                                     if item is not None:
-                                        record = json.dumps(item, cls=EnhancedJSONEncoder).encode('utf-8')
+                                        record = json.dumps(
+                                            item, cls=EnhancedJSONEncoder
+                                        ).encode("utf-8")
 
                                         producer.send(target_topic.name, record)
 
                                         with metrics_lock:
-                                            metrics['bytes_count'] += len(record)
-                                            metrics['count_out'] += 1
+                                            metrics["bytes_count"] += len(record)
+                                            metrics["count_out"] += 1
+
+                                # Flush producer to ensure messages are sent before committing
+                                producer.flush()
+
+                            # Commit offset only after successful processing and flushing
+                            # This ensures at-least-once delivery semantics
+                            consumer.commit()
 
                 except Exception as e:
-                    cli_log(CliLogData(action="Function", message=str(e), message_type="Error"))
+                    cli_log(
+                        CliLogData(
+                            action="Function", message=str(e), message_type="Error"
+                        )
+                    )
                     if not running.is_set():
                         break
                     # Add a small delay before retrying on error
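Note: together with `enable_auto_commit=False` in `create_consumer`, the added `producer.flush()` and `consumer.commit()` calls give the runner at-least-once semantics: a source offset is committed only after the transformed output has been handed to the broker. A condensed sketch of that loop using the kafka-python API; the topic names, broker address, and identity transform are placeholders:

```python
import json

from kafka import KafkaConsumer, KafkaProducer

consumer = KafkaConsumer(
    "source-topic",  # placeholder topic
    bootstrap_servers="localhost:9092",
    group_id="flow-source-topic-target-topic",
    enable_auto_commit=False,  # commit manually, only after a successful flush
    auto_offset_reset="earliest",
    value_deserializer=lambda m: json.loads(m.decode("utf-8")),
)
producer = KafkaProducer(bootstrap_servers="localhost:9092")

while True:
    batch = consumer.poll(timeout_ms=1000)
    for records in batch.values():
        for record in records:
            out = record.value  # the real runner applies the streaming function here
            if out is not None:
                producer.send("target-topic", json.dumps(out).encode("utf-8"))
            producer.flush()  # make sure the output reached the broker...
            consumer.commit()  # ...before the source offset is committed
```

If the process dies between `producer.send` and `consumer.commit`, the message is simply re-delivered and re-processed on restart, which is the trade-off at-least-once delivery accepts.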
@@ -555,16 +665,16 @@ def main():
         log("Processing thread did not exit cleanly")
 
     # Clean up Kafka resources regardless of thread state
-    if kafka_refs['consumer']:
+    if kafka_refs["consumer"]:
         try:
-            kafka_refs['consumer'].close()
+            kafka_refs["consumer"].close()
         except Exception as e:
             log(f"Error closing consumer: {e}")
 
-    if kafka_refs['producer'] and kafka_refs['producer'] is not None:
+    if kafka_refs["producer"] and kafka_refs["producer"] is not None:
         try:
-            kafka_refs['producer'].flush()
-            kafka_refs['producer'].close()
+            kafka_refs["producer"].flush()
+            kafka_refs["producer"].close()
         except Exception as e:
             log(f"Error closing producer: {e}")