airbyte-cdk 0.67.1__py3-none-any.whl → 0.67.3__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (28) hide show
  1. airbyte_cdk/sources/abstract_source.py +30 -69
  2. airbyte_cdk/sources/connector_state_manager.py +12 -26
  3. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
  4. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
  5. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
  6. airbyte_cdk/sources/streams/__init__.py +2 -2
  7. airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
  8. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
  9. airbyte_cdk/sources/streams/core.py +36 -34
  10. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/METADATA +3 -3
  11. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/RECORD +28 -28
  12. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
  13. unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
  14. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
  15. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
  16. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
  17. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
  18. unit_tests/sources/file_based/test_scenarios.py +2 -2
  19. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
  20. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
  21. unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
  22. unit_tests/sources/streams/test_stream_read.py +221 -11
  23. unit_tests/sources/test_abstract_source.py +142 -130
  24. unit_tests/sources/test_connector_state_manager.py +3 -124
  25. unit_tests/sources/test_source.py +18 -14
  26. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/LICENSE.txt +0 -0
  27. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/WHEEL +0 -0
  28. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/top_level.txt +0 -0
@@ -284,61 +284,6 @@ def test_get_stream_state(input_state, stream_name, namespace, expected_state):
284
284
  assert actual_state == expected_state
285
285
 
286
286
 
287
- @pytest.mark.parametrize(
288
- "input_state, expected_legacy_state, expected_error",
289
- [
290
- pytest.param(
291
- [AirbyteStateMessage(type=AirbyteStateType.LEGACY, data={"actresses": {"id": "seehorn_rhea"}})],
292
- {"actresses": {"id": "seehorn_rhea"}},
293
- does_not_raise(),
294
- id="test_get_legacy_legacy_state_message",
295
- ),
296
- pytest.param(
297
- [
298
- AirbyteStateMessage(
299
- type=AirbyteStateType.STREAM,
300
- stream=AirbyteStreamState(
301
- stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
302
- stream_state=AirbyteStateBlob.parse_obj({"id": "seehorn_rhea"}),
303
- ),
304
- )
305
- ],
306
- {"actresses": {"id": "seehorn_rhea"}},
307
- does_not_raise(),
308
- id="test_get_legacy_from_stream_state",
309
- ),
310
- pytest.param(
311
- {
312
- "actors": {"created_at": "1962-10-22"},
313
- "actresses": {"id": "seehorn_rhea"},
314
- },
315
- {"actors": {"created_at": "1962-10-22"}, "actresses": {"id": "seehorn_rhea"}},
316
- does_not_raise(),
317
- id="test_get_legacy_from_legacy_state_blob",
318
- ),
319
- pytest.param(
320
- [
321
- AirbyteStateMessage(
322
- type=AirbyteStateType.STREAM,
323
- stream=AirbyteStreamState(
324
- stream_descriptor=StreamDescriptor(name="actresses", namespace="public"),
325
- stream_state=None,
326
- ),
327
- )
328
- ],
329
- {"actresses": {}},
330
- does_not_raise(),
331
- id="test_get_legacy_from_stream_state",
332
- ),
333
- ],
334
- )
335
- def test_get_legacy_state(input_state, expected_legacy_state, expected_error):
336
- with expected_error:
337
- state_manager = ConnectorStateManager({}, input_state)
338
- actual_legacy_state = state_manager._get_legacy_state()
339
- assert actual_legacy_state == expected_legacy_state
340
-
341
-
342
287
  def test_get_state_returns_deep_copy():
343
288
  input_state = [
344
289
  AirbyteStateMessage(
@@ -422,11 +367,10 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
422
367
  assert state_manager.per_stream_states[
423
368
  HashableStreamDescriptor(name=update_name, namespace=update_namespace)
424
369
  ] == AirbyteStateBlob.parse_obj(update_value)
425
- assert state_manager._get_legacy_state() == expected_legacy_state
426
370
 
427
371
 
428
372
  @pytest.mark.parametrize(
429
- "start_state, update_name, update_namespace, send_per_stream, expected_state_message",
373
+ "start_state, update_name, update_namespace, expected_state_message",
430
374
  [
431
375
  pytest.param(
432
376
  [
@@ -447,7 +391,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
447
391
  ],
448
392
  "episodes",
449
393
  "public",
450
- True,
451
394
  AirbyteMessage(
452
395
  type=MessageType.STATE,
453
396
  state=AirbyteStateMessage(
@@ -456,7 +399,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
456
399
  stream_descriptor=StreamDescriptor(name="episodes", namespace="public"),
457
400
  stream_state=AirbyteStateBlob.parse_obj({"created_at": "2022_05_22"}),
458
401
  ),
459
- data={"episodes": {"created_at": "2022_05_22"}, "seasons": {"id": 1}},
460
402
  ),
461
403
  ),
462
404
  id="test_emit_state_message_with_stream_and_legacy",
@@ -473,7 +415,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
473
415
  ],
474
416
  "episodes",
475
417
  "public",
476
- True,
477
418
  AirbyteMessage(
478
419
  type=MessageType.STATE,
479
420
  state=AirbyteStateMessage(
@@ -482,7 +423,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
482
423
  stream_descriptor=StreamDescriptor(name="episodes", namespace="public"),
483
424
  stream_state=AirbyteStateBlob(),
484
425
  ),
485
- data={"episodes": {}},
486
426
  ),
487
427
  ),
488
428
  id="test_always_emit_message_with_stream_state_blob",
@@ -499,7 +439,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
499
439
  ],
500
440
  "missing",
501
441
  "public",
502
- True,
503
442
  AirbyteMessage(
504
443
  type=MessageType.STATE,
505
444
  state=AirbyteStateMessage(
@@ -507,7 +446,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
507
446
  stream=AirbyteStreamState(
508
447
  stream_descriptor=StreamDescriptor(name="missing", namespace="public"), stream_state=AirbyteStateBlob()
509
448
  ),
510
- data={"episodes": {"id": 507}},
511
449
  ),
512
450
  ),
513
451
  id="test_emit_state_nonexistent_stream_name",
@@ -524,7 +462,6 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
524
462
  ],
525
463
  "episodes",
526
464
  "nonexistent",
527
- True,
528
465
  AirbyteMessage(
529
466
  type=MessageType.STATE,
530
467
  state=AirbyteStateMessage(
@@ -532,72 +469,14 @@ def test_update_state_for_stream(start_state, update_name, update_namespace, upd
532
469
  stream=AirbyteStreamState(
533
470
  stream_descriptor=StreamDescriptor(name="episodes", namespace="nonexistent"), stream_state=AirbyteStateBlob()
534
471
  ),
535
- data={"episodes": {"id": 507}},
536
472
  ),
537
473
  ),
538
474
  id="test_emit_state_wrong_namespace",
539
475
  ),
540
- pytest.param(
541
- [
542
- AirbyteStateMessage(
543
- type=AirbyteStateType.STREAM,
544
- stream=AirbyteStreamState(
545
- stream_descriptor=StreamDescriptor(name="episodes", namespace=None),
546
- stream_state=AirbyteStateBlob.parse_obj({"created_at": "2022_05_22"}),
547
- ),
548
- ),
549
- AirbyteStateMessage(
550
- type=AirbyteStateType.STREAM,
551
- stream=AirbyteStreamState(
552
- stream_descriptor=StreamDescriptor(name="seasons", namespace=None),
553
- stream_state=AirbyteStateBlob.parse_obj({"id": 1}),
554
- ),
555
- ),
556
- ],
557
- "episodes",
558
- "",
559
- False,
560
- AirbyteMessage(
561
- type=MessageType.STATE,
562
- state=AirbyteStateMessage(
563
- data={"episodes": {"created_at": "2022_05_22"}, "seasons": {"id": 1}},
564
- ),
565
- ),
566
- id="test_emit_legacy_state_format",
567
- ),
568
476
  ],
569
477
  )
570
- def test_create_state_message(start_state, update_name, update_namespace, send_per_stream, expected_state_message):
478
+ def test_create_state_message(start_state, update_name, update_namespace, expected_state_message):
571
479
  state_manager = ConnectorStateManager({}, start_state)
572
480
 
573
- actual_state_message = state_manager.create_state_message(
574
- stream_name=update_name, namespace=update_namespace, send_per_stream_state=send_per_stream
575
- )
481
+ actual_state_message = state_manager.create_state_message(stream_name=update_name, namespace=update_namespace)
576
482
  assert actual_state_message == expected_state_message
577
-
578
-
579
- def test_do_not_set_stream_descriptor_namespace_when_none():
580
- """
581
- This is a very specific test to ensure that the None value is not set and emitted back to the platform for namespace.
582
- The platform performs validation on the state message sent by the connector and namespace must be a string or not
583
- included at all. The None value registers as null by the platform which is not valid input. We can verify that fields
584
- on a pydantic model are not defined using exclude_unset parameter.
585
- """
586
- expected_stream_state_descriptor = {"name": "episodes"}
587
-
588
- state_manager = ConnectorStateManager(
589
- {},
590
- [
591
- AirbyteStateMessage(
592
- type=AirbyteStateType.STREAM,
593
- stream=AirbyteStreamState(
594
- stream_descriptor=StreamDescriptor(name="episodes"),
595
- stream_state=None,
596
- ),
597
- ),
598
- ],
599
- )
600
-
601
- actual_state_message = state_manager.create_state_message(stream_name="episodes", namespace=None, send_per_stream_state=True)
602
-
603
- assert actual_state_message.state.stream.stream_descriptor.dict(exclude_unset=True) == expected_stream_state_descriptor
@@ -365,8 +365,8 @@ def test_internal_config(abstract_source, catalog):
365
365
  # Test with empty config
366
366
  logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}")
367
367
  records = [r for r in abstract_source.read(logger=logger, config={}, catalog=catalog, state={})]
368
- # 3 for http stream, 3 for non http stream and 3 for stream status messages for each stream (2x)
369
- assert len(records) == 3 + 3 + 3 + 3
368
+ # 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
369
+ assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
370
370
  assert http_stream.read_records.called
371
371
  assert non_http_stream.read_records.called
372
372
  # Make sure page_size hasn't been set
@@ -375,21 +375,21 @@ def test_internal_config(abstract_source, catalog):
375
375
  # Test with records limit set to 1
376
376
  internal_config = {"some_config": 100, "_limit": 1}
377
377
  records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
378
- # 1 from http stream + 1 from non http stream and 3 for stream status messages for each stream (2x)
379
- assert len(records) == 1 + 1 + 3 + 3
378
+ # 1 from http stream + 1 from non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
379
+ assert len(records) == 1 + 1 + 1 + 1 + 3 + 3
380
380
  assert "_limit" not in abstract_source.streams_config
381
381
  assert "some_config" in abstract_source.streams_config
382
382
  # Test with records limit set to a number that exceeds expected records
383
383
  internal_config = {"some_config": 100, "_limit": 20}
384
384
  records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
385
- assert len(records) == 3 + 3 + 3 + 3
385
+ assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
386
386
 
387
387
  # Check if page_size parameter is set to http instance only
388
388
  internal_config = {"some_config": 100, "_page_size": 2}
389
389
  records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})]
390
390
  assert "_page_size" not in abstract_source.streams_config
391
391
  assert "some_config" in abstract_source.streams_config
392
- assert len(records) == 3 + 3 + 3 + 3
392
+ assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
393
393
  assert http_stream.page_size == 2
394
394
  # Make sure page_size hasn't been set for non http streams
395
395
  assert not non_http_stream.page_size
@@ -403,6 +403,7 @@ def test_internal_config_limit(mocker, abstract_source, catalog):
403
403
  SLICE_DEBUG_LOG_COUNT = 1
404
404
  FULL_RECORDS_NUMBER = 3
405
405
  TRACE_STATUS_COUNT = 3
406
+ STATE_COUNT = 1
406
407
  streams = abstract_source.streams(None)
407
408
  http_stream = streams[0]
408
409
  http_stream.read_records.return_value = [{}] * FULL_RECORDS_NUMBER
@@ -410,7 +411,7 @@ def test_internal_config_limit(mocker, abstract_source, catalog):
410
411
 
411
412
  catalog.streams[0].sync_mode = SyncMode.full_refresh
412
413
  records = [r for r in abstract_source.read(logger=logger_mock, config=internal_config, catalog=catalog, state={})]
413
- assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
414
+ assert len(records) == STREAM_LIMIT + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
414
415
  logger_info_args = [call[0][0] for call in logger_mock.info.call_args_list]
415
416
  # Check if log line matches number of limit
416
417
  read_log_record = [_l for _l in logger_info_args if _l.startswith("Read")]
@@ -440,6 +441,7 @@ SCHEMA = {"type": "object", "properties": {"value": {"type": "string"}}}
440
441
  def test_source_config_no_transform(mocker, abstract_source, catalog):
441
442
  SLICE_DEBUG_LOG_COUNT = 1
442
443
  TRACE_STATUS_COUNT = 3
444
+ STATE_COUNT = 1
443
445
  logger_mock = mocker.MagicMock()
444
446
  logger_mock.level = logging.DEBUG
445
447
  streams = abstract_source.streams(None)
@@ -447,7 +449,7 @@ def test_source_config_no_transform(mocker, abstract_source, catalog):
447
449
  http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
448
450
  http_stream.read_records.return_value, non_http_stream.read_records.return_value = [[{"value": 23}] * 5] * 2
449
451
  records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
450
- assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT)
452
+ assert len(records) == 2 * (5 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT)
451
453
  assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": 23}] * 2 * 5
452
454
  assert http_stream.get_json_schema.call_count == 5
453
455
  assert non_http_stream.get_json_schema.call_count == 5
@@ -458,6 +460,7 @@ def test_source_config_transform(mocker, abstract_source, catalog):
458
460
  logger_mock.level = logging.DEBUG
459
461
  SLICE_DEBUG_LOG_COUNT = 2
460
462
  TRACE_STATUS_COUNT = 6
463
+ STATE_COUNT = 2
461
464
  streams = abstract_source.streams(None)
462
465
  http_stream, non_http_stream = streams
463
466
  http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
@@ -465,7 +468,7 @@ def test_source_config_transform(mocker, abstract_source, catalog):
465
468
  http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
466
469
  http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}]
467
470
  records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
468
- assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
471
+ assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
469
472
  assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}] * 2
470
473
 
471
474
 
@@ -474,13 +477,14 @@ def test_source_config_transform_and_no_transform(mocker, abstract_source, catal
474
477
  logger_mock.level = logging.DEBUG
475
478
  SLICE_DEBUG_LOG_COUNT = 2
476
479
  TRACE_STATUS_COUNT = 6
480
+ STATE_COUNT = 2
477
481
  streams = abstract_source.streams(None)
478
482
  http_stream, non_http_stream = streams
479
483
  http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
480
484
  http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA
481
485
  http_stream.read_records.return_value, non_http_stream.read_records.return_value = [{"value": 23}], [{"value": 23}]
482
486
  records = [r for r in abstract_source.read(logger=logger_mock, config={}, catalog=catalog, state={})]
483
- assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT
487
+ assert len(records) == 2 + SLICE_DEBUG_LOG_COUNT + TRACE_STATUS_COUNT + STATE_COUNT
484
488
  assert [r.record.data for r in records if r.type == Type.RECORD] == [{"value": "23"}, {"value": 23}]
485
489
 
486
490
 
@@ -526,8 +530,8 @@ def test_read_default_http_availability_strategy_stream_available(catalog, mocke
526
530
  source = MockAbstractSource(streams=streams)
527
531
  logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}")
528
532
  records = [r for r in source.read(logger=logger, config={}, catalog=catalog, state={})]
529
- # 3 for http stream, 3 for non http stream and 3 for stream status messages for each stream (2x)
530
- assert len(records) == 3 + 3 + 3 + 3
533
+ # 3 for http stream, 3 for non http stream, 1 for state message for each stream (2x) and 3 for stream status messages for each stream (2x)
534
+ assert len(records) == 3 + 3 + 1 + 1 + 3 + 3
531
535
  assert http_stream.read_records.called
532
536
  assert non_http_stream.read_records.called
533
537
 
@@ -584,8 +588,8 @@ def test_read_default_http_availability_strategy_stream_unavailable(catalog, moc
584
588
  with caplog.at_level(logging.WARNING):
585
589
  records = [r for r in source.read(logger=logger, config={}, catalog=catalog, state={})]
586
590
 
587
- # 0 for http stream, 3 for non http stream and 3 status trace messages
588
- assert len(records) == 0 + 3 + 3
591
+ # 0 for http stream, 3 for non http stream, 1 for non http stream state message and 3 status trace messages
592
+ assert len(records) == 0 + 3 + 1 + 3
589
593
  assert non_http_stream.read_records.called
590
594
  expected_logs = [
591
595
  f"Skipped syncing stream '{http_stream.name}' because it was unavailable.",