airbyte-cdk 0.67.0__py3-none-any.whl → 0.67.2__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- airbyte_cdk/sources/abstract_source.py +30 -69
- airbyte_cdk/sources/connector_state_manager.py +12 -26
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +552 -524
- airbyte_cdk/sources/file_based/config/csv_format.py +2 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
- airbyte_cdk/sources/streams/core.py +36 -34
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +3 -2
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +31 -31
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
- unit_tests/sources/file_based/config/test_csv_format.py +6 -1
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
- unit_tests/sources/file_based/test_scenarios.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
- unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
- unit_tests/sources/streams/test_stream_read.py +221 -11
- unit_tests/sources/test_abstract_source.py +142 -130
- unit_tests/sources/test_connector_state_manager.py +3 -124
- unit_tests/sources/test_source.py +18 -14
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
unit_tests/sources/test_connector_state_manager.py
@@ -284,61 +284,6 @@ def test_get_stream_state(input_state, stream_name, namespace, expected_state):
     assert actual_state == expected_state


-@pytest.mark.parametrize(
-    "input_state, expected_legacy_state, expected_error",
-    [
-        pytest.param(
-            [AirbyteStateMessage(type=AirbyteStateType.LEGACY, data={"actresses": {"id": "seehorn_rhea"}})],
-            {"actresses": {"id": "seehorn_rhea"}},
-            does_not_raise(),
-            id="test_get_legacy_legacy_state_message",
-        ),
-        pytest.param(
-            [
-                AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(
-                        stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
-                        stream_state=AirbyteStateBlob.parse_obj({"id": "seehorn_rhea"}),
-                    ),
-                )
-            ],
-            {"actresses": {"id": "seehorn_rhea"}},
-            does_not_raise(),
-            id="test_get_legacy_from_stream_state",
-        ),
-        pytest.param(
-            {
-                "actors": {"created_at": "1962-10-22"},
-                "actresses": {"id": "seehorn_rhea"},
-            },
-            {"actors": {"created_at": "1962-10-22"}, "actresses": {"id": "seehorn_rhea"}},
-            does_not_raise(),
-            id="test_get_legacy_from_legacy_state_blob",
-        ),
-        pytest.param(
-            [
-                AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(
-                        stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
-                        stream_state=None,
-                    ),
-                )
-            ],
-            {"actresses": {}},
-            does_not_raise(),
-            id="test_get_legacy_from_stream_state",
-        ),
-    ],
-)
-def test_get_legacy_state(input_state, expected_legacy_state, expected_error):
-    with expected_error:
-        state_manager = ConnectorStateManager({}, input_state)
-        actual_legacy_state = state_manager._get_legacy_state()
-        assert actual_legacy_state == expected_legacy_state
-
-
 def test_get_state_returns_deep_copy():
     input_state = [
         AirbyteStateMessage(
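The hunk above deletes test_get_legacy_state and with it the coverage of ConnectorStateManager._get_legacy_state, which airbyte_cdk/sources/connector_state_manager.py drops in 0.67.2. A minimal sketch of the surviving read path, reusing a fixture from the deleted test; the constructor shape and get_stream_state come from the surrounding tests, everything else is illustrative:

from airbyte_cdk.models import AirbyteStateBlob, AirbyteStateMessage, AirbyteStateType, AirbyteStreamState, StreamDescriptor
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager

# Per-stream state message, copied from the test fixtures above.
input_state = [
    AirbyteStateMessage(
        type=AirbyteStateType.STREAM,
        stream=AirbyteStreamState(
            stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
            stream_state=AirbyteStateBlob.parse_obj({"id": "seehorn_rhea"}),
        ),
    )
]
state_manager = ConnectorStateManager({}, input_state)

# Per-stream lookup (exercised by test_get_stream_state above) replaces the
# removed legacy-blob accessor; the expected value here is an assumption.
assert state_manager.get_stream_state("actresses", "public") == {"id": "seehorn_rhea"}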
@@ -422,11 +367,10 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
     assert state_manager.per_stream_states[
         HashableStreamDescriptor(name=update_name, namespace=update_namespace)
     ] == AirbyteStateBlob.parse_obj(update_value)
-    assert state_manager._get_legacy_state() == expected_legacy_state


 @pytest.mark.parametrize(
-    "start_state, update_name, update_namespace, send_per_stream, expected_state_message",
+    "start_state, update_name, update_namespace, expected_state_message",
     [
         pytest.param(
             [
@@ -447,7 +391,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
             ],
             "episodes",
             "public",
-            True,
             AirbyteMessage(
                 type=MessageType.STATE,
                 state=AirbyteStateMessage(
@@ -456,7 +399,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
                     stream_descriptor=StreamDescriptor(name="episodes", namespace="public"),
                     stream_state=AirbyteStateBlob.parse_obj({"created_at": "2022_05_22"}),
                 ),
-                data={"episodes": {"created_at": "2022_05_22"}, "seasons": {"id": 1}},
             ),
         ),
         id="test_emit_state_message_with_stream_and_legacy",
@@ -473,7 +415,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
             ],
             "episodes",
             "public",
-            True,
             AirbyteMessage(
                 type=MessageType.STATE,
                 state=AirbyteStateMessage(
@@ -482,7 +423,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
                     stream_descriptor=StreamDescriptor(name="episodes", namespace="public"),
                     stream_state=AirbyteStateBlob(),
                 ),
-                data={"episodes": {}},
             ),
         ),
         id="test_always_emit_message_with_stream_state_blob",
@@ -499,7 +439,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
             ],
             "missing",
             "public",
-            True,
             AirbyteMessage(
                 type=MessageType.STATE,
                 state=AirbyteStateMessage(
@@ -507,7 +446,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
                 stream=AirbyteStreamState(
                     stream_descriptor=StreamDescriptor(name="missing", namespace="public"), stream_state=AirbyteStateBlob()
                 ),
-                data={"episodes": {"id": 507}},
             ),
         ),
         id="test_emit_state_nonexistent_stream_name",
@@ -524,7 +462,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
             ],
             "episodes",
             "nonexistent",
-            True,
             AirbyteMessage(
                 type=MessageType.STATE,
                 state=AirbyteStateMessage(
@@ -532,72 +469,14 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
                 stream=AirbyteStreamState(
                     stream_descriptor=StreamDescriptor(name="episodes", namespace="nonexistent"), stream_state=AirbyteStateBlob()
                 ),
-                data={"episodes": {"id": 507}},
             ),
         ),
         id="test_emit_state_wrong_namespace",
     ),
-        pytest.param(
-            [
-                AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(
-                        stream_descriptor=StreamDescriptor(name="episodes", namespace=None),
-                        stream_state=AirbyteStateBlob.parse_obj({"created_at": "2022_05_22"}),
-                    ),
-                ),
-                AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(
-                        stream_descriptor=StreamDescriptor(name="seasons", namespace=None),
-                        stream_state=AirbyteStateBlob.parse_obj({"id": 1}),
-                    ),
-                ),
-            ],
-            "episodes",
-            "",
-            False,
-            AirbyteMessage(
-                type=MessageType.STATE,
-                state=AirbyteStateMessage(
-                    data={"episodes": {"created_at": "2022_05_22"}, "seasons": {"id": 1}},
-                ),
-            ),
-            id="test_emit_legacy_state_format",
-        ),
     ],
 )
-def test_create_state_message(start_state, update_name, update_namespace, send_per_stream, expected_state_message):
+def test_create_state_message(start_state, update_name, update_namespace, expected_state_message):
     state_manager = ConnectorStateManager({}, start_state)
 
-    actual_state_message = state_manager.create_state_message(
-        stream_name=update_name, namespace=update_namespace, send_per_stream_state=send_per_stream
-    )
+    actual_state_message = state_manager.create_state_message(stream_name=update_name, namespace=update_namespace)
     assert actual_state_message == expected_state_message
-
-
-def test_do_not_set_stream_descriptor_namespace_when_none():
-    """
-    This is a very specific test to ensure that the None value is not set and emitted back to the platform for namespace.
-    The platform performs validation on the state message sent by the connector and namespace must be a string or not
-    included at all. The None value registers as null by the platform which is not valid input. We can verify that fields
-    on a pydantic model are not defined using exclude_unset parameter.
-    """
-    expected_stream_state_descriptor = {"name": "episodes"}
-
-    state_manager = ConnectorStateManager(
-        {},
-        [
-            AirbyteStateMessage(
-                type=AirbyteStateType.STREAM,
-                stream=AirbyteStreamState(
-                    stream_descriptor=StreamDescriptor(name="episodes"),
-                    stream_state=None,
-                ),
-            ),
-        ],
-    )
-
-    actual_state_message = state_manager.create_state_message(stream_name="episodes", namespace=None, send_per_stream_state=True)
-
-    assert actual_state_message.state.stream.stream_descriptor.dict(exclude_unset=True) == expected_stream_state_descriptor
unit_tests/sources/test_abstract_source.py
@@ -365,8 +365,8 @@ def test_internal_config(abstract_source, catalog):
     # Test with empty config
     logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}")
     records = [r for r in abstract_source.read(logger=logger, config={}, catalog=catalog, state={})]
-    # 3 for http stream, 3 for non http stream and 3 for stream status messages for each stream (2x)
-    assert len(records) == 3 + 3 + 3 + 3
+    # 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
+    assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
     assert http_stream.read_records.called
     assert non_http_stream.read_records.called
     # Make sure page_size havent been set
@@ -375,21 +375,21 @@ def test_internal_config(abstract_source, catalog):
     # Test with records limit set to 1
     internal_config = {"some_config": 100, "_limit": 1}
     records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
-    # 1 from http stream + 1 from non http stream and 3 for stream status messages for each stream (2x)
-    assert len(records) == 1 + 1 + 3 + 3
+    # 1 from http stream + 1 from non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
+    assert len(records) == 1 + 1 + 1 + 1 + 3 + 3
     assert "_limit" not in abstract_source.streams_config
     assert "some_config" in abstract_source.streams_config
     # Test with records limit set to number that exceeds expceted records
     internal_config = {"some_config": 100, "_limit": 20}
     records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
-    assert len(records) == 3 + 3 + 3 + 3
+    assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
 
     # Check if page_size paramter is set to http instance only
     internal_config = {"some_config": 100, "_page_size": 2}
     records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
     assert "_page_size" not in abstract_source.streams_config
     assert "some_config" in abstract_source.streams_config
-    assert len(records) == 3 + 3 + 3 + 3
+    assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
     assert http_stream.page_size == 2
     # Make sure page_size havent been set for non http streams
     assert not non_http_stream.page_size
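These updated counts encode the behavioral change in AbstractSource.read(): in addition to its records and three status TRACE messages, each synced stream now emits one STATE message even in full refresh. A small illustrative helper (not part of the package) that makes the arithmetic in the assertions explicit:

from collections import Counter

def message_mix(messages):
    # Tally AirbyteMessage objects yielded by read() by their type enum.
    return Counter(m.type.name for m in messages)

# Empty-config case above, two streams x (3 records + 1 state + 3 trace):
# message_mix(records) -> {"RECORD": 6, "TRACE": 6, "STATE": 2}
# i.e. len(records) == 3 + 3 + 1 + 1 + 3 + 3 == 14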
@@ -403,6 +403,7 @@ def test_internal_config_limit(mocker, abstract_source, catalog):
     SLICE_DEBUG_LOG_COUNT = 1
     FULL_RECORDS_NUMBER = 3
     TRACE_STATUS_COUNT = 3
+    STATE_COUNT = 1
     streams = abstract_source.streams(None)
     http_stream = streams[0]
     http_stream.read_records.return_value = [{}] * FULL_RECORDS_NUMBER
@@ -410,7 +411,7 @@ def test_internal_config_limit(mocker, abstract_source, catalog):
 
     catalog.streams[0].sync_mode = SyncMode.full_refresh
     records = [r for r in abstract_source.read(logger=logger_mock, config=internal_config, catalog=catalog, state={})]
-    assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
+    assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
     logger_info_args = [call[0][0] for call in logger_mock.info.call_args_list]
     # Check if log line matches number of limit
     read_log_record = [_l for _l in logger_info_args if _l.startswith("Read")]
@@ -440,6 +441,7 @@ SCHEMA = {"type": "object", "properties": {"value": {"type": "string"}}}
 def test_source_config_no_transform(mocker, abstract_source, catalog):
     SLICE_DEBUG_LOG_COUNT = 1
     TRACE_STATUS_COUNT = 3
+    STATE_COUNT = 1
     logger_mock = mocker.MagicMock()
     logger_mock.level = logging.DEBUG
     streams = abstract_source.streams(None)
@@ -447,7 +449,7 @@ def test_source_config_no_transform(mocker, abstract_source, catalog):
     http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
     http_stream.read_records.return_value, non_http_stream.read_records.return_value = [[{"value": 23}] * 5] * 2
     records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
-    assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT)
+    assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT)
     assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": 23}] * 2 * 5
     assert http_stream.get_json_schema.call_count == 5
     assert non_http_stream.get_json_schema.call_count == 5
@@ -458,6 +460,7 @@ def test_source_config_transform(mocker, abstract_source, catalog):
     logger_mock.level = logging.DEBUG
     SLICE_DEBUG_LOG_COUNT = 2
     TRACE_STATUS_COUNT = 6
+    STATE_COUNT = 2
     streams = abstract_source.streams(None)
     http_stream, non_http_stream = streams
     http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
@@ -465,7 +468,7 @@ def test_source_config_transform(mocker, abstract_source, catalog):
     http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
     http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}]
     records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
-    assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
+    assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
     assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}] * 2
 
 
@@ -474,13 +477,14 @@ def test_source_config_transform_and_no_transform(mocker, abstract_source, catal
     logger_mock.level = logging.DEBUG
     SLICE_DEBUG_LOG_COUNT = 2
     TRACE_STATUS_COUNT = 6
+    STATE_COUNT = 2
     streams = abstract_source.streams(None)
     http_stream, non_http_stream = streams
     http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
     http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
     http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}]
     records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
-    assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
+    assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
     assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}, {"value": 23}]
 
 
@@ -526,8 +530,8 @@ def test_read_default_http_availability_strategy_stream_available(catalog, mocke
     source = MockAbstractSource(streams=streams)
     logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}")
     records = [r for r in source.read(logger=logger, config={}, catalog=catalog, state={})]
-    # 3 for http stream, 3 for non http stream and 3 for stream status messages for each stream (2x)
-    assert len(records) == 3 + 3 + 3 + 3
+    # 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
+    assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
     assert http_stream.read_records.called
     assert non_http_stream.read_records.called
 
@@ -584,8 +588,8 @@ def test_read_default_http_availability_strategy_stream_unavailable(catalog, moc
     with caplog.at_level(logging.WARNING):
         records = [r for r in source.read(logger=logger, config={}, catalog=catalog, state={})]
 
-    # 0 for http stream, 3 for non http stream and 3 status trace messages
-    assert len(records) == 0 + 3 + 3
+    # 0 for http stream, 3 for non http stream, 1 for non http stream state message and 3 status trace messages
+    assert len(records) == 0 + 3 + 1 + 3
     assert non_http_stream.read_records.called
     expected_logs = [
         f"Skipped syncing stream '{http_stream.name}' because it was unavailable.",
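Note the asymmetry in this last hunk: a stream skipped as unavailable contributes neither records nor a STATE message, so only the stream that actually synced adds one. An illustrative check in the spirit of the test above, applied to the `records` list it collects:

from airbyte_cdk.models import Type

# Only the non-http stream synced, so exactly one STATE message is expected.
state_messages = [m for m in records if m.type == Type.STATE]
assert len(state_messages) == 1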