airbyte-cdk 0.67.0__py3-none-any.whl → 0.67.2__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registries.
Files changed (31)
  1. airbyte_cdk/sources/abstract_source.py +30 -69
  2. airbyte_cdk/sources/connector_state_manager.py +12 -26
  3. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +552 -524
  4. airbyte_cdk/sources/file_based/config/csv_format.py +2 -0
  5. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
  6. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
  7. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
  8. airbyte_cdk/sources/streams/__init__.py +2 -2
  9. airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
  10. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
  11. airbyte_cdk/sources/streams/core.py +36 -34
  12. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +3 -2
  13. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +31 -31
  14. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
  15. unit_tests/sources/file_based/config/test_csv_format.py +6 -1
  16. unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
  19. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
  20. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
  21. unit_tests/sources/file_based/test_scenarios.py +2 -2
  22. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
  23. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
  24. unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
  25. unit_tests/sources/streams/test_stream_read.py +221 -11
  26. unit_tests/sources/test_abstract_source.py +142 -130
  27. unit_tests/sources/test_connector_state_manager.py +3 -124
  28. unit_tests/sources/test_source.py +18 -14
  29. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
  30. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
  31. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
unit_tests/sources/test_connector_state_manager.py
@@ -284,61 +284,6 @@ def test_get_stream_state(input_state, stream_name, namespace, expected_state):
     assert actual_state == expected_state
 
 
-@pytest.mark.parametrize(
-    "input_state, expected_legacy_state, expected_error",
-    [
-        pytest.param(
-            [AirbyteStateMessage(type=AirbyteStateType.LEGACY, data={"actresses": {"id": "seehorn_rhea"}})],
-            {"actresses": {"id": "seehorn_rhea"}},
-            does_not_raise(),
-            id="test_get_legacy_legacy_state_message",
-        ),
-        pytest.param(
-            [
-                AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(
-                        stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
-                        stream_state=AirbyteStateBlob.parse_obj({"id": "seehorn_rhea"}),
-                    ),
-                )
-            ],
-            {"actresses": {"id": "seehorn_rhea"}},
-            does_not_raise(),
-            id="test_get_legacy_from_stream_state",
-        ),
-        pytest.param(
-            {
-                "actors": {"created_at": "1962-10-22"},
-                "actresses": {"id": "seehorn_rhea"},
-            },
-            {"actors": {"created_at": "1962-10-22"}, "actresses": {"id": "seehorn_rhea"}},
-            does_not_raise(),
-            id="test_get_legacy_from_legacy_state_blob",
-        ),
-        pytest.param(
-            [
-                AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(
-                        stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
-                        stream_state=None,
-                    ),
-                )
-            ],
-            {"actresses": {}},
-            does_not_raise(),
-            id="test_get_legacy_from_stream_state",
-        ),
-    ],
-)
-def test_get_legacy_state(input_state, expected_legacy_state, expected_error):
-    with expected_error:
-        state_manager = ConnectorStateManager({}, input_state)
-        actual_legacy_state = state_manager._get_legacy_state()
-        assert actual_legacy_state == expected_legacy_state
-
-
 def test_get_state_returns_deep_copy():
     input_state = [
         AirbyteStateMessage(
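The hunk above removes test_get_legacy_state together with the private ConnectorStateManager._get_legacy_state() helper it exercised; per-stream lookups remain the supported path. Below is a minimal sketch, not taken from this diff, of that remaining API; the import paths and the get_stream_state signature are assumptions based on the surrounding tests.

from airbyte_cdk.models import (
    AirbyteStateBlob,
    AirbyteStateMessage,
    AirbyteStateType,
    AirbyteStreamState,
    StreamDescriptor,
)
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager

input_state = [
    AirbyteStateMessage(
        type=AirbyteStateType.STREAM,
        stream=AirbyteStreamState(
            stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
            stream_state=AirbyteStateBlob.parse_obj({"id": "seehorn_rhea"}),
        ),
    )
]

state_manager = ConnectorStateManager({}, input_state)
# Per-stream lookup is what the remaining tests cover; the aggregated
# legacy blob returned by _get_legacy_state() is no longer available.
assert state_manager.get_stream_state("actresses", "public") == {"id": "seehorn_rhea"}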
@@ -422,11 +367,10 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
     assert state_manager.per_stream_states[
         HashableStreamDescriptor(name=update_name, namespace=update_namespace)
     ] == AirbyteStateBlob.parse_obj(update_value)
-    assert state_manager._get_legacy_state() == expected_legacy_state
 
 
 @pytest.mark.parametrize(
-    "start_state, update_name, update_namespace, send_per_stream, expected_state_message",
+    "start_state, update_name, update_namespace, expected_state_message",
     [
         pytest.param(
             [
@@ -447,7 +391,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
             ],
             "episodes",
             "public",
-            True,
             AirbyteMessage(
                 type=MessageType.STATE,
                 state=AirbyteStateMessage(
@@ -456,7 +399,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
                         stream_descriptor=StreamDescriptor(name="episodes", namespace="public"),
                         stream_state=AirbyteStateBlob.parse_obj({"created_at": "2022_05_22"}),
                     ),
-                    data={"episodes": {"created_at": "2022_05_22"}, "seasons": {"id": 1}},
                 ),
             ),
             id="test_emit_state_message_with_stream_and_legacy",
@@ -473,7 +415,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
             ],
             "episodes",
             "public",
-            True,
             AirbyteMessage(
                 type=MessageType.STATE,
                 state=AirbyteStateMessage(
@@ -482,7 +423,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
                         stream_descriptor=StreamDescriptor(name="episodes", namespace="public"),
                         stream_state=AirbyteStateBlob(),
                     ),
-                    data={"episodes": {}},
                 ),
             ),
             id="test_always_emit_message_with_stream_state_blob",
@@ -499,7 +439,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
             ],
             "missing",
             "public",
-            True,
             AirbyteMessage(
                 type=MessageType.STATE,
                 state=AirbyteStateMessage(
@@ -507,7 +446,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
                     stream=AirbyteStreamState(
                         stream_descriptor=StreamDescriptor(name="missing", namespace="public"), stream_state=AirbyteStateBlob()
                     ),
-                    data={"episodes": {"id": 507}},
                 ),
             ),
             id="test_emit_state_nonexistent_stream_name",
@@ -524,7 +462,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
             ],
             "episodes",
             "nonexistent",
-            True,
             AirbyteMessage(
                 type=MessageType.STATE,
                 state=AirbyteStateMessage(
@@ -532,72 +469,14 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
                     stream=AirbyteStreamState(
                         stream_descriptor=StreamDescriptor(name="episodes", namespace="nonexistent"), stream_state=AirbyteStateBlob()
                     ),
-                    data={"episodes": {"id": 507}},
                 ),
             ),
             id="test_emit_state_wrong_namespace",
         ),
-        pytest.param(
-            [
-                AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(
-                        stream_descriptor=StreamDescriptor(name="episodes", namespace=None),
-                        stream_state=AirbyteStateBlob.parse_obj({"created_at": "2022_05_22"}),
-                    ),
-                ),
-                AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(
-                        stream_descriptor=StreamDescriptor(name="seasons", namespace=None),
-                        stream_state=AirbyteStateBlob.parse_obj({"id": 1}),
-                    ),
-                ),
-            ],
-            "episodes",
-            "",
-            False,
-            AirbyteMessage(
-                type=MessageType.STATE,
-                state=AirbyteStateMessage(
-                    data={"episodes": {"created_at": "2022_05_22"}, "seasons": {"id": 1}},
-                ),
-            ),
-            id="test_emit_legacy_state_format",
-        ),
     ],
 )
-def test_create_state_message(start_state, update_name, update_namespace, send_per_stream, expected_state_message):
+def test_create_state_message(start_state, update_name, update_namespace, expected_state_message):
     state_manager = ConnectorStateManager({}, start_state)
 
-    actual_state_message = state_manager.create_state_message(
-        stream_name=update_name, namespace=update_namespace, send_per_stream_state=send_per_stream
-    )
+    actual_state_message = state_manager.create_state_message(stream_name=update_name, namespace=update_namespace)
     assert actual_state_message == expected_state_message
-
-
-def test_do_not_set_stream_descriptor_namespace_when_none():
-    """
-    This is a very specific test to ensure that the None value is not set and emitted back to the platform for namespace.
-    The platform performs validation on the state message sent by the connector and namespace must be a string or not
-    included at all. The None value registers as null by the platform which is not valid input. We can verify that fields
-    on a pydantic model are not defined using exclude_unset parameter.
-    """
-    expected_stream_state_descriptor = {"name": "episodes"}
-
-    state_manager = ConnectorStateManager(
-        {},
-        [
-            AirbyteStateMessage(
-                type=AirbyteStateType.STREAM,
-                stream=AirbyteStreamState(
-                    stream_descriptor=StreamDescriptor(name="episodes"),
-                    stream_state=None,
-                ),
-            ),
-        ],
-    )
-
-    actual_state_message = state_manager.create_state_message(stream_name="episodes", namespace=None, send_per_stream_state=True)
-
-    assert actual_state_message.state.stream.stream_descriptor.dict(exclude_unset=True) == expected_stream_state_descriptor
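The net effect of this hunk: create_state_message() drops its send_per_stream_state flag and now always emits a per-stream STATE message, and the legacy aggregated data blob disappears along with test_emit_legacy_state_format. A minimal sketch of the new call site, assuming the airbyte_cdk.models import names used elsewhere in these tests:

from airbyte_cdk.models import AirbyteStateBlob, Type as MessageType
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager

state_manager = ConnectorStateManager({}, [])  # no prior state for any stream
message = state_manager.create_state_message(stream_name="episodes", namespace="public")

# Per the test cases above, a stream with no prior state still gets an empty blob.
assert message.type == MessageType.STATE
assert message.state.stream.stream_descriptor.name == "episodes"
assert message.state.stream.stream_state == AirbyteStateBlob()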
unit_tests/sources/test_abstract_source.py
@@ -365,8 +365,8 @@ def test_internal_config(abstract_source, catalog):
     # Test with empty config
     logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}")
     records = [r for r in abstract_source.read(logger=logger, config={}, catalog=catalog, state={})]
-    # 3 for http stream, 3 for non http stream and 3 for stream status messages for each stream (2x)
-    assert len(records) == 3 + 3 + 3 + 3
+    # 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
+    assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
     assert http_stream.read_records.called
     assert non_http_stream.read_records.called
     # Make sure page_size havent been set
@@ -375,21 +375,21 @@ def test_internal_config(abstract_source, catalog):
     # Test with records limit set to 1
     internal_config = {"some_config": 100, "_limit": 1}
     records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
-    # 1 from http stream + 1 from non http stream and 3 for stream status messages for each stream (2x)
-    assert len(records) == 1 + 1 + 3 + 3
+    # 1 from http stream + 1 from non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
+    assert len(records) == 1 + 1 + 1 + 1 + 3 + 3
     assert "_limit" not in abstract_source.streams_config
     assert "some_config" in abstract_source.streams_config
     # Test with records limit set to number that exceeds expceted records
     internal_config = {"some_config": 100, "_limit": 20}
     records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
-    assert len(records) == 3 + 3 + 3 + 3
+    assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
 
     # Check if page_size paramter is set to http instance only
     internal_config = {"some_config": 100, "_page_size": 2}
     records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
     assert "_page_size" not in abstract_source.streams_config
     assert "some_config" in abstract_source.streams_config
-    assert len(records) == 3 + 3 + 3 + 3
+    assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
     assert http_stream.page_size == 2
     # Make sure page_size havent been set for non http streams
     assert not non_http_stream.page_size
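The new totals follow from one STATE message now being emitted per stream even on a full-refresh read: each of the two mocked streams contributes 3 records, 1 state message, and 3 status trace messages, hence 3 + 3 + 1 + 1 + 3 + 3 = 14. A quick tally sketch (hypothetical names, just restating the arithmetic):

per_stream = {"records": 3, "state": 1, "status_traces": 3}
# two streams in the catalog: one HTTP, one non-HTTP
assert 2 * sum(per_stream.values()) == 3 + 3 + 1 + 1 + 3 + 3 == 14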
@@ -403,6 +403,7 @@ def test_internal_config_limit(mocker, abstract_source, catalog):
     SLICE_DEBUG_LOG_COUNT = 1
     FULL_RECORDS_NUMBER = 3
     TRACE_STATUS_COUNT = 3
+    STATE_COUNT = 1
     streams = abstract_source.streams(None)
     http_stream = streams[0]
     http_stream.read_records.return_value = [{}] * FULL_RECORDS_NUMBER
@@ -410,7 +411,7 @@ def test_internal_config_limit(mocker, abstract_source, catalog):
 
     catalog.streams[0].sync_mode = SyncMode.full_refresh
     records = [r for r in abstract_source.read(logger=logger_mock, config=internal_config, catalog=catalog, state={})]
-    assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
+    assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
     logger_info_args = [call[0][0] for call in logger_mock.info.call_args_list]
     # Check if log line matches number of limit
     read_log_record = [_l for _l in logger_info_args if _l.startswith("Read")]
@@ -440,6 +441,7 @@ SCHEMA = {"type": "object", "properties": {"value": {"type": "string"}}}
 def test_source_config_no_transform(mocker, abstract_source, catalog):
     SLICE_DEBUG_LOG_COUNT = 1
     TRACE_STATUS_COUNT = 3
+    STATE_COUNT = 1
     logger_mock = mocker.MagicMock()
     logger_mock.level = logging.DEBUG
     streams = abstract_source.streams(None)
@@ -447,7 +449,7 @@ def test_source_config_no_transform(mocker, abstract_source, catalog):
     http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
     http_stream.read_records.return_value, non_http_stream.read_records.return_value = [[{"value": 23}] * 5] * 2
     records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
-    assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT)
+    assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT)
     assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": 23}] * 2 * 5
     assert http_stream.get_json_schema.call_count == 5
     assert non_http_stream.get_json_schema.call_count == 5
@@ -458,6 +460,7 @@ def test_source_config_transform(mocker, abstract_source, catalog):
     logger_mock.level = logging.DEBUG
     SLICE_DEBUG_LOG_COUNT = 2
     TRACE_STATUS_COUNT = 6
+    STATE_COUNT = 2
     streams = abstract_source.streams(None)
     http_stream, non_http_stream = streams
     http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
@@ -465,7 +468,7 @@ def test_source_config_transform(mocker, abstract_source, catalog):
     http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
     http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}]
     records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
-    assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
+    assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
     assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}] * 2
 
 
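The transform test above hinges on TypeTransformer with DefaultSchemaNormalization: it casts the integer 23 to the string "23" so the record matches the string-typed schema, and only http_stream has the transformer attached. A standalone sketch, assuming the airbyte_cdk.sources.utils.transform import path:

from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

SCHEMA = {"type": "object", "properties": {"value": {"type": "string"}}}
transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

record = {"value": 23}
transformer.transform(record, SCHEMA)  # normalizes the record in place against SCHEMA
assert record == {"value": "23"}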
@@ -474,13 +477,14 @@ def test_source_config_transform_and_no_transform(mocker, abstract_source, catal
     logger_mock.level = logging.DEBUG
     SLICE_DEBUG_LOG_COUNT = 2
     TRACE_STATUS_COUNT = 6
+    STATE_COUNT = 2
     streams = abstract_source.streams(None)
     http_stream, non_http_stream = streams
     http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
     http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
     http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}]
     records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
-    assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
+    assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
     assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}, {"value": 23}]
 
 
@@ -526,8 +530,8 @@ def test_read_default_http_availability_strategy_stream_available(catalog, mocke
     source = MockAbstractSource(streams=streams)
     logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}")
     records = [r for r in source.read(logger=logger, config={}, catalog=catalog, state={})]
-    # 3 for http stream, 3 for non http stream and 3 for stream status messages for each stream (2x)
-    assert len(records) == 3 + 3 + 3 + 3
+    # 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
+    assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
     assert http_stream.read_records.called
     assert non_http_stream.read_records.called
 
@@ -584,8 +588,8 @@ def test_read_default_http_availability_strategy_stream_unavailable(catalog, moc
     with caplog.at_level(logging.WARNING):
        records = [r for r in source.read(logger=logger, config={}, catalog=catalog, state={})]
 
-    # 0 for http stream, 3 for non http stream and 3 status trace messages
-    assert len(records) == 0 + 3 + 3
+    # 0 for http stream, 3 for non http stream, 1 for non http stream state message and 3 status trace messages
+    assert len(records) == 0 + 3 + 1 + 3
     assert non_http_stream.read_records.called
     expected_logs = [
         f"Skipped syncing stream '{http_stream.name}' because it was unavailable.",
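The unavailable-stream case shows the other side of the new state behavior: the skipped http stream contributes no records and no state message, so only the non-http stream's 3 records, 1 state message, and 3 status traces remain (0 + 3 + 1 + 3 = 7). A hedged tally over the records list read in that test, assuming the standard message-type enum values:

from collections import Counter

# `records` is the list read in the test above; message.type is the AirbyteMessage type enum.
counts = Counter(message.type.value for message in records)
assert counts == {"RECORD": 3, "STATE": 1, "TRACE": 3}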