airbyte-cdk 0.61.2__py3-none-any.whl → 0.62.1__py3-none-any.whl

Files changed (33)
  1. airbyte_cdk/sources/abstract_source.py +14 -33
  2. airbyte_cdk/sources/connector_state_manager.py +16 -4
  3. airbyte_cdk/sources/file_based/file_based_source.py +87 -35
  4. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
  5. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
  6. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
  7. airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
  8. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
  9. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
  10. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
  11. airbyte_cdk/test/mock_http/mocker.py +3 -1
  12. airbyte_cdk/test/mock_http/response.py +9 -1
  13. airbyte_cdk/utils/traced_exception.py +1 -16
  14. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
  15. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
  19. unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
  20. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  21. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
  22. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
  23. unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
  24. unit_tests/sources/file_based/test_scenarios.py +16 -8
  25. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
  26. unit_tests/sources/test_abstract_source.py +36 -170
  27. unit_tests/sources/test_connector_state_manager.py +20 -13
  28. unit_tests/sources/test_integration_source.py +8 -25
  29. unit_tests/sources/test_source_read.py +1 -1
  30. unit_tests/test/mock_http/test_mocker.py +3 -1
  31. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
  32. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
  33. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
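The centerpiece of this release is the new `FileBasedConcurrentCursor` (plus its no-op counterpart) for concurrent file-based syncs. Its stream state, exercised at length in the new test file below, keeps a `history` map of file URIs to last-modified timestamps plus a composite cursor value under `_ab_source_file_last_modified` of the form `{timestamp}_{uri}`. A minimal sketch of that serialization, inferred from the test data (the helper names are ours, not CDK API):

    from datetime import datetime
    from typing import Tuple

    # Matches FileBasedConcurrentCursor.DATE_TIME_FORMAT as used in the tests below.
    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    def serialize_cursor(last_modified: datetime, uri: str) -> str:
        # e.g. "2021-01-01T00:00:00.000000Z_a.csv"
        return f"{last_modified.strftime(DATE_TIME_FORMAT)}_{uri}"

    def parse_cursor(value: str) -> Tuple[datetime, str]:
        # The timestamp contains no underscore, so split on the first one;
        # URIs that themselves contain underscores survive intact.
        timestamp, _, uri = value.partition("_")
        return datetime.strptime(timestamp, DATE_TIME_FORMAT), uri

Comparing cursor values as (timestamp, uri) tuples rather than raw strings is what produces the deterministic URI tie-break seen in the `test_should_sync_file` cases below.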
unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py
@@ -0,0 +1,462 @@
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+
+from datetime import datetime
+from typing import Any, Dict, List, MutableMapping, Optional, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from airbyte_cdk.models import AirbyteStateMessage, SyncMode
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor
+from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
+from freezegun import freeze_time
+
+DATE_TIME_FORMAT = FileBasedConcurrentCursor.DATE_TIME_FORMAT
+MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+
+
+def _make_cursor(input_state: Optional[MutableMapping[str, Any]]) -> FileBasedConcurrentCursor:
+    stream = MagicMock()
+    stream.name = "test"
+    stream.namespace = None
+    stream_config = MagicMock()
+    stream_config.days_to_sync_if_history_is_full = MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+    cursor = FileBasedConcurrentCursor(
+        stream_config,
+        stream.name,
+        None,
+        input_state,
+        MagicMock(),
+        ConnectorStateManager(
+            stream_instance_map={stream.name: stream},
+            state=[AirbyteStateMessage.parse_obj(input_state)] if input_state is not None else None,
+        ),
+        CursorField(FileBasedConcurrentCursor.CURSOR_FIELD),
+    )
+    return cursor
+
+
+@pytest.mark.parametrize(
+    "input_state, expected_cursor_value",
+    [
+        pytest.param({}, (datetime.min, ""), id="no-state-gives-min-cursor"),
+        pytest.param({"history": {}}, (datetime.min, ""), id="missing-cursor-field-gives-min-cursor"),
+        pytest.param(
+            {
+                "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-matches-earliest-file",
+        ),
+        pytest.param(
+            {
+                "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2020-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-is-earlier",
+        ),
+        pytest.param(
+            {
+                "history": {"a.csv": "2022-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-is-later",
+        ),
+        pytest.param(
+            {
+                "history": {
+                    "a.csv": "2021-01-01T00:00:00.000000Z",
+                    "b.csv": "2021-01-02T00:00:00.000000Z",
+                    "c.csv": "2021-01-03T00:00:00.000000Z"
+                },
+                "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-not-earliest",
+        ),
+        pytest.param(
+            {
+                "history": {"b.csv": "2020-12-31T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2020-12-31T00:00:00.000000Z", DATE_TIME_FORMAT), "b.csv"),
+            id="state-with-cursor-and-earlier-history"
+        ),
+        pytest.param(
+            {
+                "history": {"b.csv": "2021-01-02T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="state-with-cursor-and-later-history"
+        ),
+    ]
+)
+def test_compute_prev_sync_cursor(input_state: MutableMapping[str, Any], expected_cursor_value: Tuple[datetime, str]):
+    cursor = _make_cursor(input_state)
+    assert cursor._compute_prev_sync_cursor(input_state) == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+    [
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-empty-history-single-pending-file",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            "2020-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-pending-file-is-older",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            "2022-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-pending-file-is-newer",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-nonempty-history-single-pending-file",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            "2020-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-nonempty-history-pending-file-is-older",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            "2022-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-nonempty-history-pending-file-is-newer",
+        ),
+    ]
+)
+def test_add_file(
+    initial_state: MutableMapping[str, Any],
+    pending_files: List[Tuple[str, str]],
+    file_to_add: Tuple[str, str],
+    expected_history: Dict[str, Any],
+    expected_pending_files: List[Tuple[str, str]],
+    expected_cursor_value: str,
+):
+    cursor = _make_cursor(initial_state)
+    mock_message_repository = MagicMock()
+    cursor._message_repository = mock_message_repository
+    stream = MagicMock()
+
+    cursor.set_pending_partitions([
+        FileBasedStreamPartition(
+            stream,
+            {"files": [RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]},
+            mock_message_repository,
+            SyncMode.full_refresh,
+            FileBasedConcurrentCursor.CURSOR_FIELD,
+            initial_state,
+            cursor
+        ) for uri, timestamp in pending_files
+    ])
+
+    uri, timestamp = file_to_add
+    cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+    assert cursor._file_to_datetime_history == expected_history
+    assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+    assert mock_message_repository.emit_message.call_args_list[0].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+    [
+        pytest.param(
+            {"history": {}},
+            [],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-empty-history-no-pending-files",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+            "2021-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-file-not-in-pending-files",
+        ),
+    ]
+)
+def test_add_file_invalid(
+    initial_state: MutableMapping[str, Any],
+    pending_files: List[Tuple[str, str]],
+    file_to_add: Tuple[str, str],
+    expected_history: Dict[str, Any],
+    expected_pending_files: List[Tuple[str, str]],
+    expected_cursor_value: str,
+):
+    cursor = _make_cursor(initial_state)
+    cursor._pending_files = {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in pending_files}
+    mock_message_repository = MagicMock()
+    cursor._message_repository = mock_message_repository
+
+    uri, timestamp = file_to_add
+    cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+    assert cursor._file_to_datetime_history == expected_history
+    assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+    assert mock_message_repository.emit_message.call_args_list[0].args[0].log.level.value == "WARN"
+    assert mock_message_repository.emit_message.call_args_list[1].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "input_state, pending_files, expected_cursor_value",
+    [
+        pytest.param({}, [], f"{datetime.min.strftime('%Y-%m-%dT%H:%M:%S.%fZ')}_", id="no-state-no-pending"),
+        pytest.param(
+            {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+            [],
+            "2021-01-01T00:00:00.000000Z_a.csv",
+            id="no-pending-with-history"
+        ),
+        pytest.param(
+            {"history": {}},
+            [("b.csv", "2021-01-02T00:00:00.000000Z")],
+            "2021-01-02T00:00:00.000000Z_b.csv",
+            id="pending-no-history"
+        ),
+        pytest.param(
+            {"history": {"a.csv": "2022-01-01T00:00:00.000000Z"}},
+            [("b.csv", "2021-01-02T00:00:00.000000Z")],
+            "2021-01-01T00:00:00.000000Z_a.csv",
+            id="with-pending-before-history"
+        ),
+        pytest.param(
+            {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+            [("b.csv", "2022-01-02T00:00:00.000000Z")],
+            "2022-01-01T00:00:00.000000Z_a.csv",
+            id="with-pending-after-history"
+        ),
+    ]
+)
+def test_get_new_cursor_value(input_state: MutableMapping[str, Any], pending_files: List[Tuple[str, str]], expected_cursor_value: str):
+    cursor = _make_cursor(input_state)
+    pending_partitions = []
+    for url, timestamp in pending_files:
+        partition = MagicMock()
+        partition.to_slice = lambda *args, **kwargs: {"files": [RemoteFile(uri=url, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]}
+        pending_partitions.append(partition)
+
+    cursor.set_pending_partitions(pending_partitions)
+
+
+@pytest.mark.parametrize(
+    "all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync",
+    [
+        pytest.param(
+            [RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {},
+            False,
+            (datetime.min, ""),
+            ["new.csv"],
+            id="empty-history-one-new-file"
+        ),
+        pytest.param(
+            [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            ["a.csv"],
+            id="non-empty-history-file-in-history-modified"
+        ),
+        pytest.param(
+            [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            [],
+            id="non-empty-history-file-in-history-not-modified"
+        ),
+    ]
+)
+def test_get_files_to_sync(all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync):
+    cursor = _make_cursor({})
+    cursor._file_to_datetime_history = history
+    cursor._prev_cursor_value = prev_cursor_value
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    files_to_sync = list(cursor.get_files_to_sync(all_files, MagicMock()))
+    assert [f.uri for f in files_to_sync] == expected_files_to_sync
+
+
+@freeze_time("2023-06-16T00:00:00Z")
+@pytest.mark.parametrize(
+    "file_to_check, history, is_history_full, prev_cursor_value, sync_start, expected_should_sync",
+    [
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            True,
+            id="file-not-in-history-not-full-old-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            False,
+            (datetime.strptime("2024-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), ""),
+            datetime.min,
+            True,
+            id="file-not-in-history-not-full-new-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            False,
+            id="file-in-history-not-modified"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            False,
+            id="file-in-history-modified-before"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            True,
+            id="file-in-history-modified-after"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.min,
+            True,
+            id="history-full-file-modified-after-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="new1.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new0.csv"),
+            datetime.min,
+            True,
+            id="history-full-modified-eq-cursor-uri-gt"
+        ),
+        pytest.param(
+            RemoteFile(uri="new0.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new1.csv"),
+            datetime.min,
+            False,
+            id="history-full-modified-eq-cursor-uri-lt"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.min,
+            True,
+            id="history-full-modified-before-cursor-and-after-sync-start"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.strptime("2024-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+            False,
+            id="history-full-modified-before-cursor-and-before-sync-start"
+        ),
+    ]
+)
+def test_should_sync_file(
+    file_to_check: RemoteFile,
+    history: Dict[str, Any],
+    is_history_full: bool,
+    prev_cursor_value: Tuple[datetime, str],
+    sync_start: datetime,
+    expected_should_sync: bool,
+):
+    cursor = _make_cursor({})
+    cursor._file_to_datetime_history = history
+    cursor._prev_cursor_value = prev_cursor_value
+    cursor._sync_start = sync_start
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    should_sync = cursor._should_sync_file(file_to_check, MagicMock())
+    assert should_sync == expected_should_sync
+
+
+@freeze_time("2023-06-16T00:00:00Z")
+@pytest.mark.parametrize(
+    "input_history, is_history_full, expected_start_time",
+    [
+        pytest.param({}, False, datetime.min, id="empty-history"),
+        pytest.param(
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+            id="non-full-history"
+        ),
+        pytest.param(
+            {f"file{i}.csv": f"2021-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all before the time window
+            True,
+            datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),  # Earliest file time
+            id="full-history-earliest-before-window"
+        ),
+        pytest.param(
+            {f"file{i}.csv": f"2024-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all after the time window
+            True,
+            datetime.strptime("2023-06-13T00:00:00.000000Z", DATE_TIME_FORMAT),  # Time window start time
+            id="full-history-earliest-after-window"
+        ),
+    ]
+)
+def test_compute_start_time(input_history, is_history_full, expected_start_time, monkeypatch):
+    cursor = _make_cursor({"history": input_history})
+    cursor._file_to_datetime_history = input_history
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    assert cursor._compute_start_time() == expected_start_time
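Taken together, the `test_compute_prev_sync_cursor` cases pin down the rule: the previous sync's cursor is the minimum, ordered by (timestamp, URI), over the value stored in `_ab_source_file_last_modified` and every entry in `history`, defaulting to `(datetime.min, "")` when the state is empty. An illustrative re-implementation of that rule (not the CDK's code) that reproduces all eight cases:

    from datetime import datetime
    from typing import Any, Mapping, Optional, Tuple

    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    def compute_prev_sync_cursor(state: Optional[Mapping[str, Any]]) -> Tuple[datetime, str]:
        candidates = []
        if state:
            serialized = state.get("_ab_source_file_last_modified")
            if serialized:
                ts, _, uri = serialized.partition("_")
                candidates.append((datetime.strptime(ts, DATE_TIME_FORMAT), uri))
            for uri, ts in state.get("history", {}).items():
                candidates.append((datetime.strptime(ts, DATE_TIME_FORMAT), uri))
        # The earliest (timestamp, uri) pair wins; no candidates yields the zero value.
        return min(candidates, default=(datetime.min, ""))

For example, history {"b.csv": "2020-12-31..."} alongside cursor "2021-01-01..._a.csv" resolves to the earlier ("2020-12-31...", "b.csv"), matching the "state-with-cursor-and-earlier-history" case above.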
unit_tests/sources/file_based/test_file_based_scenarios.py
@@ -26,6 +26,29 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
     success_multi_stream_scenario,
     success_user_provided_schema_scenario,
 )
+from unit_tests.sources.file_based.scenarios.concurrent_incremental_scenarios import (
+    multi_csv_different_timestamps_scenario_concurrent,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+    multi_csv_per_timestamp_scenario_concurrent,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_scenario_concurrent,
+    multi_csv_skip_file_if_already_in_history_concurrent,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+    single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+    single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+    single_csv_input_state_is_earlier_scenario_concurrent,
+    single_csv_input_state_is_later_scenario_concurrent,
+    single_csv_no_input_state_scenario_concurrent,
+)
 from unit_tests.sources.file_based.scenarios.csv_scenarios import (
     csv_autogenerate_column_names_scenario,
     csv_custom_bool_values_scenario,
@@ -214,6 +237,28 @@ discover_scenarios = [
     unstructured_invalid_file_type_discover_scenario_no_skip,
     unstructured_invalid_file_type_discover_scenario_skip,
     unstructured_invalid_file_type_read_scenario,
+    multi_csv_different_timestamps_scenario_concurrent,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+    multi_csv_per_timestamp_scenario_concurrent,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_scenario_concurrent,
+    multi_csv_skip_file_if_already_in_history_concurrent,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+    single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+    single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+    single_csv_input_state_is_earlier_scenario_concurrent,
+    single_csv_input_state_is_later_scenario_concurrent,
+    single_csv_no_input_state_scenario_concurrent,
+
 ]
 
 read_scenarios = discover_scenarios + [
unit_tests/sources/file_based/test_scenarios.py
@@ -13,6 +13,7 @@ from _pytest.reports import ExceptionInfo
 from airbyte_cdk.entrypoint import launch
 from airbyte_cdk.models import AirbyteAnalyticsTraceMessage, SyncMode
 from airbyte_cdk.sources import AbstractSource
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
 from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput
 from airbyte_cdk.test.entrypoint_wrapper import read as entrypoint_read
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
@@ -72,12 +73,10 @@ def assert_exception(expected_exception: type[BaseException], output: EntrypointOutput) -> None:
 def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
     records, log_messages = output.records_and_state_messages, output.logs
     logs = [message.log for message in log_messages if message.log.level.value in scenario.log_levels]
-    expected_records = scenario.expected_records
-
-    if expected_records is None:
+    if scenario.expected_records is None:
         return
 
-    assert len(records) == len(expected_records)
+    expected_records = [r for r in scenario.expected_records] if scenario.expected_records else []
 
     sorted_expected_records = sorted(
         filter(lambda e: "data" in e, expected_records),
@@ -87,6 +86,9 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
         filter(lambda r: r.record, records),
         key=lambda record: ",".join(f"{k}={v}" for k, v in sorted(record.record.data.items(), key=lambda items: (items[0], items[1])) if k != "emitted_at"),
     )
+
+    assert len(sorted_records) == len(sorted_expected_records)
+
     for actual, expected in zip(sorted_records, sorted_expected_records):
         if actual.record:
             assert len(actual.record.data) == len(expected["data"])
@@ -97,10 +99,16 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
                 assert value == expected["data"][key]
             assert actual.record.stream == expected["stream"]
 
-    expected_states = filter(lambda e: "data" not in e, expected_records)
-    states = filter(lambda r: r.state, records)
-    for actual, expected in zip(states, expected_states):  # states should be emitted in sorted order
-        assert actual.state.data == expected
+    expected_states = list(filter(lambda e: "data" not in e, expected_records))
+    states = list(filter(lambda r: r.state, records))
+
+    if hasattr(scenario.source, "cursor_cls") and issubclass(scenario.source.cursor_cls, AbstractConcurrentFileBasedCursor):
+        # Only check the last state emitted because we don't know the order the others will be in.
+        # This may be needed for non-file-based concurrent scenarios too.
+        assert states[-1].state.data == expected_states[-1]
+    else:
+        for actual, expected in zip(states, expected_states):  # states should be emitted in sorted order
+            assert actual.state.data == expected
 
     if scenario.expected_logs:
         read_logs = scenario.expected_logs.get("read")
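The relaxed assertion for concurrent file-based sources reflects that partitions complete in nondeterministic order, so intermediate state messages can legitimately differ between runs while the final state is stable. A toy illustration (file names and timestamps invented for the example):

    # With two files synced concurrently, the per-completion state snapshots
    # depend on which partition finishes first, but the final snapshot does not.
    ts_a, ts_b = "2021-01-01T00:00:00.000000Z", "2021-01-02T00:00:00.000000Z"

    a_then_b = [
        {"history": {"a.csv": ts_a}},
        {"history": {"a.csv": ts_a, "b.csv": ts_b}},
    ]
    b_then_a = [
        {"history": {"b.csv": ts_b}},
        {"history": {"a.csv": ts_a, "b.csv": ts_b}},
    ]

    assert a_then_b[0] != b_then_a[0]    # intermediate states are order-dependent
    assert a_then_b[-1] == b_then_a[-1]  # the last emitted state is deterministic

Hence `_verify_read_output` compares only `states[-1]` against `expected_states[-1]` when the source's `cursor_cls` is an `AbstractConcurrentFileBasedCursor`.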
unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py
@@ -5,7 +5,14 @@ import concurrent
 import logging
 from typing import Any, List, Mapping, Optional, Tuple, Union
 
-from airbyte_cdk.models import AirbyteStateMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, DestinationSyncMode, SyncMode
+from airbyte_cdk.models import (
+    AirbyteStateMessage,
+    AirbyteStream,
+    ConfiguredAirbyteCatalog,
+    ConnectorSpecification,
+    DestinationSyncMode,
+    SyncMode,
+)
 from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
 from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
 from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
@@ -51,7 +58,11 @@ class StreamFacadeSource(ConcurrentSourceAdapter):
         return True, None
 
     def streams(self, config: Mapping[str, Any]) -> List[Stream]:
-        state_manager = ConnectorStateManager(stream_instance_map={s.name: s for s in self._streams}, state=self._state)
+        state_manager = ConnectorStateManager(
+            stream_instance_map={s.name: AirbyteStream(name=s.name, namespace=None, json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]) for s in self._streams},
+            state=self._state,
+        )  # The input values into the AirbyteStream are dummy values; the connector state manager only uses `name` and `namespace`
+
         state_converter = StreamFacadeConcurrentConnectorStateConverter()
         stream_states = [state_manager.get_stream_state(stream.name, stream.namespace) for stream in self._streams]
         return [
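This hunk tracks the `connector_state_manager.py` changes in this release, which let `stream_instance_map` carry lightweight `AirbyteStream` descriptors; per the comment in the diff, only `name` and `namespace` are read. A hedged sketch of the calling pattern (the helper function is ours; the constructor arguments mirror the diff above):

    from airbyte_cdk.models import AirbyteStream, SyncMode
    from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager

    def make_state_manager(stream_names, state=None):
        # json_schema and supported_sync_modes are dummy values; the state
        # manager only consults each stream's name and namespace.
        return ConnectorStateManager(
            stream_instance_map={
                name: AirbyteStream(
                    name=name,
                    namespace=None,
                    json_schema={},
                    supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
                )
                for name in stream_names
            },
            state=state,
        )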