airbyte-cdk 0.67.1__py3-none-any.whl → 0.67.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/sources/abstract_source.py +30 -69
- airbyte_cdk/sources/connector_state_manager.py +12 -26
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
- airbyte_cdk/sources/streams/core.py +36 -34
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +28 -28
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
- unit_tests/sources/file_based/test_scenarios.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
- unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
- unit_tests/sources/streams/test_stream_read.py +221 -11
- unit_tests/sources/test_abstract_source.py +142 -130
- unit_tests/sources/test_connector_state_manager.py +3 -124
- unit_tests/sources/test_source.py +18 -14
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
@@ -284,61 +284,6 @@ def test_get_stream_state(input_state, stream_name, namespace, expected_state):
|
|
284
284
|
assert actual_state == expected_state
|
285
285
|
|
286
286
|
|
287
|
-
@pytest.mark.parametrize(
|
288
|
-
"input_state, expected_legacy_state, expected_error",
|
289
|
-
[
|
290
|
-
pytest.param(
|
291
|
-
[AirbyteStateMessage(type=AirbyteStateType.LEGACY, data={"actresses": {"id": "seehorn_rhea"}})],
|
292
|
-
{"actresses": {"id": "seehorn_rhea"}},
|
293
|
-
does_not_raise(),
|
294
|
-
id="test_get_legacy_legacy_state_message",
|
295
|
-
),
|
296
|
-
pytest.param(
|
297
|
-
[
|
298
|
-
AirbyteStateMessage(
|
299
|
-
type=AirbyteStateType.STREAM,
|
300
|
-
stream=AirbyteStreamState(
|
301
|
-
stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
|
302
|
-
stream_state=AirbyteStateBlob.parse_obj({"id": "seehorn_rhea"}),
|
303
|
-
),
|
304
|
-
)
|
305
|
-
],
|
306
|
-
{"actresses": {"id": "seehorn_rhea"}},
|
307
|
-
does_not_raise(),
|
308
|
-
id="test_get_legacy_from_stream_state",
|
309
|
-
),
|
310
|
-
pytest.param(
|
311
|
-
{
|
312
|
-
"actors": {"created_at": "1962-10-22"},
|
313
|
-
"actresses": {"id": "seehorn_rhea"},
|
314
|
-
},
|
315
|
-
{"actors": {"created_at": "1962-10-22"}, "actresses": {"id": "seehorn_rhea"}},
|
316
|
-
does_not_raise(),
|
317
|
-
id="test_get_legacy_from_legacy_state_blob",
|
318
|
-
),
|
319
|
-
pytest.param(
|
320
|
-
[
|
321
|
-
AirbyteStateMessage(
|
322
|
-
type=AirbyteStateType.STREAM,
|
323
|
-
stream=AirbyteStreamState(
|
324
|
-
stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
|
325
|
-
stream_state=None,
|
326
|
-
),
|
327
|
-
)
|
328
|
-
],
|
329
|
-
{"actresses": {}},
|
330
|
-
does_not_raise(),
|
331
|
-
id="test_get_legacy_from_stream_state",
|
332
|
-
),
|
333
|
-
],
|
334
|
-
)
|
335
|
-
def test_get_legacy_state(input_state, expected_legacy_state, expected_error):
|
336
|
-
with expected_error:
|
337
|
-
state_manager = ConnectorStateManager({}, input_state)
|
338
|
-
actual_legacy_state = state_manager._get_legacy_state()
|
339
|
-
assert actual_legacy_state == expected_legacy_state
|
340
|
-
|
341
|
-
|
342
287
|
def test_get_state_returns_deep_copy():
|
343
288
|
input_state = [
|
344
289
|
AirbyteStateMessage(
|
@@ -422,11 +367,10 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
422
367
|
assert state_manager.per_stream_states[
|
423
368
|
HashableStreamDescriptor(name=update_name, namespace=update_namespace)
|
424
369
|
] == AirbyteStateBlob.parse_obj(update_value)
|
425
|
-
assert state_manager._get_legacy_state() == expected_legacy_state
|
426
370
|
|
427
371
|
|
428
372
|
@pytest.mark.parametrize(
|
429
|
-
"start_state, update_name, update_namespace, send_per_stream, expected_state_message",
|
373
|
+
"start_state, update_name, update_namespace, expected_state_message",
|
430
374
|
[
|
431
375
|
pytest.param(
|
432
376
|
[
|
@@ -447,7 +391,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
447
391
|
],
|
448
392
|
"episodes",
|
449
393
|
"public",
|
450
|
-
True,
|
451
394
|
AirbyteMessage(
|
452
395
|
type=MessageType.STATE,
|
453
396
|
state=AirbyteStateMessage(
|
@@ -456,7 +399,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
456
399
|
stream_descriptor=StreamDescriptor(name="episodes", namespace="public"),
|
457
400
|
stream_state=AirbyteStateBlob.parse_obj({"created_at": "2022_05_22"}),
|
458
401
|
),
|
459
|
-
data={"episodes": {"created_at": "2022_05_22"}, "seasons": {"id": 1}},
|
460
402
|
),
|
461
403
|
),
|
462
404
|
id="test_emit_state_message_with_stream_and_legacy",
|
@@ -473,7 +415,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
473
415
|
],
|
474
416
|
"episodes",
|
475
417
|
"public",
|
476
|
-
True,
|
477
418
|
AirbyteMessage(
|
478
419
|
type=MessageType.STATE,
|
479
420
|
state=AirbyteStateMessage(
|
@@ -482,7 +423,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
482
423
|
stream_descriptor=StreamDescriptor(name="episodes", namespace="public"),
|
483
424
|
stream_state=AirbyteStateBlob(),
|
484
425
|
),
|
485
|
-
data={"episodes": {}},
|
486
426
|
),
|
487
427
|
),
|
488
428
|
id="test_always_emit_message_with_stream_state_blob",
|
@@ -499,7 +439,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
499
439
|
],
|
500
440
|
"missing",
|
501
441
|
"public",
|
502
|
-
True,
|
503
442
|
AirbyteMessage(
|
504
443
|
type=MessageType.STATE,
|
505
444
|
state=AirbyteStateMessage(
|
@@ -507,7 +446,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
507
446
|
stream=AirbyteStreamState(
|
508
447
|
stream_descriptor=StreamDescriptor(name="missing", namespace="public"), stream_state=AirbyteStateBlob()
|
509
448
|
),
|
510
|
-
data={"episodes": {"id": 507}},
|
511
449
|
),
|
512
450
|
),
|
513
451
|
id="test_emit_state_nonexistent_stream_name",
|
@@ -524,7 +462,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
524
462
|
],
|
525
463
|
"episodes",
|
526
464
|
"nonexistent",
|
527
|
-
True,
|
528
465
|
AirbyteMessage(
|
529
466
|
type=MessageType.STATE,
|
530
467
|
state=AirbyteStateMessage(
|
@@ -532,72 +469,14 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
|
|
532
469
|
stream=AirbyteStreamState(
|
533
470
|
stream_descriptor=StreamDescriptor(name="episodes", namespace="nonexistent"), stream_state=AirbyteStateBlob()
|
534
471
|
),
|
535
|
-
data={"episodes": {"id": 507}},
|
536
472
|
),
|
537
473
|
),
|
538
474
|
id="test_emit_state_wrong_namespace",
|
539
475
|
),
|
540
|
-
pytest.param(
|
541
|
-
[
|
542
|
-
AirbyteStateMessage(
|
543
|
-
type=AirbyteStateType.STREAM,
|
544
|
-
stream=AirbyteStreamState(
|
545
|
-
stream_descriptor=StreamDescriptor(name="episodes", namespace=None),
|
546
|
-
stream_state=AirbyteStateBlob.parse_obj({"created_at": "2022_05_22"}),
|
547
|
-
),
|
548
|
-
),
|
549
|
-
AirbyteStateMessage(
|
550
|
-
type=AirbyteStateType.STREAM,
|
551
|
-
stream=AirbyteStreamState(
|
552
|
-
stream_descriptor=StreamDescriptor(name="seasons", namespace=None),
|
553
|
-
stream_state=AirbyteStateBlob.parse_obj({"id": 1}),
|
554
|
-
),
|
555
|
-
),
|
556
|
-
],
|
557
|
-
"episodes",
|
558
|
-
"",
|
559
|
-
False,
|
560
|
-
AirbyteMessage(
|
561
|
-
type=MessageType.STATE,
|
562
|
-
state=AirbyteStateMessage(
|
563
|
-
data={"episodes": {"created_at": "2022_05_22"}, "seasons": {"id": 1}},
|
564
|
-
),
|
565
|
-
),
|
566
|
-
id="test_emit_legacy_state_format",
|
567
|
-
),
|
568
476
|
],
|
569
477
|
)
|
570
|
-
def test_create_state_message(start_state, update_name, update_namespace, send_per_stream, expected_state_message):
|
478
|
+
def test_create_state_message(start_state, update_name, update_namespace, expected_state_message):
|
571
479
|
state_manager = ConnectorStateManager({}, start_state)
|
572
480
|
|
573
|
-
actual_state_message = state_manager.create_state_message(
|
574
|
-
stream_name=update_name, namespace=update_namespace, send_per_stream_state=send_per_stream
|
575
|
-
)
|
481
|
+
actual_state_message = state_manager.create_state_message(stream_name=update_name, namespace=update_namespace)
|
576
482
|
assert actual_state_message == expected_state_message
|
577
|
-
|
578
|
-
|
579
|
-
def test_do_not_set_stream_descriptor_namespace_when_none():
|
580
|
-
"""
|
581
|
-
This is a very specific test to ensure that the None value is not set and emitted back to the platform for namespace.
|
582
|
-
The platform performs validation on the state message sent by the connector and namespace must be a string or not
|
583
|
-
included at all. The None value registers as null by the platform which is not valid input. We can verify that fields
|
584
|
-
on a pydantic model are not defined using exclude_unset parameter.
|
585
|
-
"""
|
586
|
-
expected_stream_state_descriptor = {"name": "episodes"}
|
587
|
-
|
588
|
-
state_manager = ConnectorStateManager(
|
589
|
-
{},
|
590
|
-
[
|
591
|
-
AirbyteStateMessage(
|
592
|
-
type=AirbyteStateType.STREAM,
|
593
|
-
stream=AirbyteStreamState(
|
594
|
-
stream_descriptor=StreamDescriptor(name="episodes"),
|
595
|
-
stream_state=None,
|
596
|
-
),
|
597
|
-
),
|
598
|
-
],
|
599
|
-
)
|
600
|
-
|
601
|
-
actual_state_message = state_manager.create_state_message(stream_name="episodes", namespace=None, send_per_stream_state=True)
|
602
|
-
|
603
|
-
assert actual_state_message.state.stream.stream_descriptor.dict(exclude_unset=True) == expected_stream_state_descriptor
|
@@ -365,8 +365,8 @@ def test_internal_config(abstract_source, catalog):
|
|
365
365
|
# Test with empty config
|
366
366
|
logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}")
|
367
367
|
records = [r for r in abstract_source.read(logger=logger, config={}, catalog=catalog, state={})]
|
368
|
-
# 3 for http stream, 3 for non http stream and 3 for stream status messages for each stream (2x)
|
369
|
-
assert len(records) == 3 + 3 + 3 + 3
|
368
|
+
# 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
|
369
|
+
assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
|
370
370
|
assert http_stream.read_records.called
|
371
371
|
assert non_http_stream.read_records.called
|
372
372
|
# Make sure page_size havent been set
|
@@ -375,21 +375,21 @@ def test_internal_config(abstract_source, catalog):
|
|
375
375
|
# Test with records limit set to 1
|
376
376
|
internal_config = {"some_config": 100, "_limit": 1}
|
377
377
|
records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
|
378
|
-
# 1 from http stream + 1 from non http stream and 3 for stream status messages for each stream (2x)
|
379
|
-
assert len(records) == 1 + 1 + 3 + 3
|
378
|
+
# 1 from http stream + 1 from non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
|
379
|
+
assert len(records) == 1 + 1 + 1 + 1 + 3 + 3
|
380
380
|
assert "_limit" not in abstract_source.streams_config
|
381
381
|
assert "some_config" in abstract_source.streams_config
|
382
382
|
# Test with records limit set to number that exceeds expceted records
|
383
383
|
internal_config = {"some_config": 100, "_limit": 20}
|
384
384
|
records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
|
385
|
-
assert len(records) == 3 + 3 + 3 + 3
|
385
|
+
assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
|
386
386
|
|
387
387
|
# Check if page_size paramter is set to http instance only
|
388
388
|
internal_config = {"some_config": 100, "_page_size": 2}
|
389
389
|
records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
|
390
390
|
assert "_page_size" not in abstract_source.streams_config
|
391
391
|
assert "some_config" in abstract_source.streams_config
|
392
|
-
assert len(records) == 3 + 3 + 3 + 3
|
392
|
+
assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
|
393
393
|
assert http_stream.page_size == 2
|
394
394
|
# Make sure page_size havent been set for non http streams
|
395
395
|
assert not non_http_stream.page_size
|
@@ -403,6 +403,7 @@ def test_internal_config_limit(mocker, abstract_source, catalog):
|
|
403
403
|
SLICE_DEBUG_LOG_COUNT = 1
|
404
404
|
FULL_RECORDS_NUMBER = 3
|
405
405
|
TRACE_STATUS_COUNT = 3
|
406
|
+
STATE_COUNT = 1
|
406
407
|
streams = abstract_source.streams(None)
|
407
408
|
http_stream = streams[0]
|
408
409
|
http_stream.read_records.return_value = [{}] * FULL_RECORDS_NUMBER
|
@@ -410,7 +411,7 @@ def test_internal_config_limit(mocker, abstract_source, catalog):
|
|
410
411
|
|
411
412
|
catalog.streams[0].sync_mode = SyncMode.full_refresh
|
412
413
|
records = [r for r in abstract_source.read(logger=logger_mock, config=internal_config, catalog=catalog, state={})]
|
413
|
-
assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
|
414
|
+
assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
|
414
415
|
logger_info_args = [call[0][0] for call in logger_mock.info.call_args_list]
|
415
416
|
# Check if log line matches number of limit
|
416
417
|
read_log_record = [_l for _l in logger_info_args if _l.startswith("Read")]
|
@@ -440,6 +441,7 @@ SCHEMA = {"type": "object", "properties": {"value": {"type": "string"}}}
|
|
440
441
|
def test_source_config_no_transform(mocker, abstract_source, catalog):
|
441
442
|
SLICE_DEBUG_LOG_COUNT = 1
|
442
443
|
TRACE_STATUS_COUNT = 3
|
444
|
+
STATE_COUNT = 1
|
443
445
|
logger_mock = mocker.MagicMock()
|
444
446
|
logger_mock.level = logging.DEBUG
|
445
447
|
streams = abstract_source.streams(None)
|
@@ -447,7 +449,7 @@ def test_source_config_no_transform(mocker, abstract_source, catalog):
|
|
447
449
|
http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
|
448
450
|
http_stream.read_records.return_value, non_http_stream.read_records.return_value = [[{"value": 23}] * 5] * 2
|
449
451
|
records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
|
450
|
-
assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT)
|
452
|
+
assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT)
|
451
453
|
assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": 23}] * 2 * 5
|
452
454
|
assert http_stream.get_json_schema.call_count == 5
|
453
455
|
assert non_http_stream.get_json_schema.call_count == 5
|
@@ -458,6 +460,7 @@ def test_source_config_transform(mocker, abstract_source, catalog):
|
|
458
460
|
logger_mock.level = logging.DEBUG
|
459
461
|
SLICE_DEBUG_LOG_COUNT = 2
|
460
462
|
TRACE_STATUS_COUNT = 6
|
463
|
+
STATE_COUNT = 2
|
461
464
|
streams = abstract_source.streams(None)
|
462
465
|
http_stream, non_http_stream = streams
|
463
466
|
http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
|
@@ -465,7 +468,7 @@ def test_source_config_transform(mocker, abstract_source, catalog):
|
|
465
468
|
http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
|
466
469
|
http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}]
|
467
470
|
records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
|
468
|
-
assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
|
471
|
+
assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
|
469
472
|
assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}] * 2
|
470
473
|
|
471
474
|
|
@@ -474,13 +477,14 @@ def test_source_config_transform_and_no_transform(mocker, abstract_source, catal
|
|
474
477
|
logger_mock.level = logging.DEBUG
|
475
478
|
SLICE_DEBUG_LOG_COUNT = 2
|
476
479
|
TRACE_STATUS_COUNT = 6
|
480
|
+
STATE_COUNT = 2
|
477
481
|
streams = abstract_source.streams(None)
|
478
482
|
http_stream, non_http_stream = streams
|
479
483
|
http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
|
480
484
|
http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
|
481
485
|
http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}]
|
482
486
|
records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
|
483
|
-
assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
|
487
|
+
assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
|
484
488
|
assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}, {"value": 23}]
|
485
489
|
|
486
490
|
|
@@ -526,8 +530,8 @@ def test_read_default_http_availability_strategy_stream_available(catalog, mocke
|
|
526
530
|
source = MockAbstractSource(streams=streams)
|
527
531
|
logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}")
|
528
532
|
records = [r for r in source.read(logger=logger, config={}, catalog=catalog, state={})]
|
529
|
-
# 3 for http stream, 3 for non http stream and 3 for stream status messages for each stream (2x)
|
530
|
-
assert len(records) == 3 + 3 + 3 + 3
|
533
|
+
# 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
|
534
|
+
assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
|
531
535
|
assert http_stream.read_records.called
|
532
536
|
assert non_http_stream.read_records.called
|
533
537
|
|
@@ -584,8 +588,8 @@ def test_read_default_http_availability_strategy_stream_unavailable(catalog, moc
|
|
584
588
|
with caplog.at_level(logging.WARNING):
|
585
589
|
records = [r for r in source.read(logger=logger, config={}, catalog=catalog, state={})]
|
586
590
|
|
587
|
-
# 0 for http stream, 3 for non http stream and 3 status trace messages
|
588
|
-
assert len(records) == 0 + 3 + 3
|
591
|
+
# 0 for http stream, 3 for non http stream, 1 for non http stream state message and 3 status trace messages
|
592
|
+
assert len(records) == 0 + 3 + 1 + 3
|
589
593
|
assert non_http_stream.read_records.called
|
590
594
|
expected_logs = [
|
591
595
|
f"Skipped syncing stream '{http_stream.name}' because it was unavailable.",
|
File without changes
|
File without changes
|
File without changes
|