airbyte-cdk 0.68.4__py3-none-any.whl → 0.69.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/entrypoint.py +27 -7
- airbyte_cdk/sources/connector_state_manager.py +0 -1
- airbyte_cdk/sources/file_based/file_based_source.py +4 -2
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +2 -2
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +2 -2
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/{file_based_noop_cursor.py → file_based_final_state_cursor.py} +21 -6
- airbyte_cdk/sources/streams/concurrent/adapters.py +2 -2
- airbyte_cdk/sources/streams/concurrent/cursor.py +27 -3
- airbyte_cdk/sources/streams/concurrent/default_stream.py +7 -3
- airbyte_cdk/test/entrypoint_wrapper.py +1 -1
- airbyte_cdk/utils/message_utils.py +17 -0
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/RECORD +30 -28
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/WHEEL +1 -1
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -2
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +128 -37
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +3 -3
- unit_tests/sources/file_based/test_file_based_scenarios.py +13 -6
- unit_tests/sources/file_based/test_scenarios.py +32 -3
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +16 -14
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +5 -4
- unit_tests/sources/streams/concurrent/test_default_stream.py +8 -6
- unit_tests/sources/streams/test_stream_read.py +3 -2
- unit_tests/sources/test_concurrent_source.py +7 -5
- unit_tests/sources/test_source_read.py +2 -3
- unit_tests/test/test_entrypoint_wrapper.py +9 -6
- unit_tests/utils/test_message_utils.py +91 -0
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/top_level.txt +0 -0
@@ -467,30 +467,24 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
|
467
467
|
)
|
468
468
|
).build()
|
469
469
|
|
470
|
-
|
470
|
+
csv_analytics_scenario: TestScenario[InMemoryFilesSource] = (
|
471
471
|
TestScenarioBuilder[InMemoryFilesSource]()
|
472
|
-
.set_name("
|
472
|
+
.set_name("csv_analytics")
|
473
473
|
.set_config(
|
474
474
|
{
|
475
475
|
"streams": [
|
476
476
|
{
|
477
477
|
"name": "stream1",
|
478
478
|
"format": {"filetype": "csv"},
|
479
|
-
"globs": ["
|
479
|
+
"globs": ["a.csv"],
|
480
480
|
"validation_policy": "Emit Record",
|
481
481
|
},
|
482
482
|
{
|
483
483
|
"name": "stream2",
|
484
484
|
"format": {"filetype": "csv"},
|
485
|
-
"globs": ["
|
486
|
-
"validation_policy": "Emit Record",
|
487
|
-
},
|
488
|
-
{
|
489
|
-
"name": "stream3",
|
490
|
-
"format": {"filetype": "jsonl"},
|
491
|
-
"globs": ["file3.jsonl"],
|
485
|
+
"globs": ["b.csv"],
|
492
486
|
"validation_policy": "Emit Record",
|
493
|
-
}
|
487
|
+
}
|
494
488
|
]
|
495
489
|
}
|
496
490
|
)
|
@@ -498,17 +492,21 @@ multi_format_analytics_scenario: TestScenario[InMemoryFilesSource] = (
|
|
498
492
|
FileBasedSourceBuilder()
|
499
493
|
.set_files(
|
500
494
|
{
|
501
|
-
"
|
502
|
-
"contents": [
|
495
|
+
"a.csv": {
|
496
|
+
"contents": [
|
497
|
+
("col1", "col2"),
|
498
|
+
("val11a", "val12a"),
|
499
|
+
("val21a", "val22a"),
|
500
|
+
],
|
503
501
|
"last_modified": "2023-06-05T03:54:07.000Z",
|
504
502
|
},
|
505
|
-
"
|
506
|
-
"contents": [
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
"last_modified": "2023-06-
|
503
|
+
"b.csv": {
|
504
|
+
"contents": [
|
505
|
+
("col1", "col2", "col3"),
|
506
|
+
("val11b", "val12b", "val13b"),
|
507
|
+
("val21b", "val22b", "val23b"),
|
508
|
+
],
|
509
|
+
"last_modified": "2023-06-05T03:54:07.000Z",
|
512
510
|
},
|
513
511
|
}
|
514
512
|
)
|
@@ -521,7 +519,12 @@ multi_format_analytics_scenario: TestScenario[InMemoryFilesSource] = (
|
|
521
519
|
"default_cursor_field": ["_ab_source_file_last_modified"],
|
522
520
|
"json_schema": {
|
523
521
|
"type": "object",
|
524
|
-
"properties": {
|
522
|
+
"properties": {
|
523
|
+
"col1": {"type": ["null", "string"]},
|
524
|
+
"col2": {"type": ["null", "string"]},
|
525
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
526
|
+
"_ab_source_file_url": {"type": "string"},
|
527
|
+
},
|
525
528
|
},
|
526
529
|
"name": "stream1",
|
527
530
|
"source_defined_cursor": True,
|
@@ -531,30 +534,64 @@ multi_format_analytics_scenario: TestScenario[InMemoryFilesSource] = (
|
|
531
534
|
"default_cursor_field": ["_ab_source_file_last_modified"],
|
532
535
|
"json_schema": {
|
533
536
|
"type": "object",
|
534
|
-
"properties": {
|
537
|
+
"properties": {
|
538
|
+
"col1": {"type": ["null", "string"]},
|
539
|
+
"col2": {"type": ["null", "string"]},
|
540
|
+
"col3": {"type": ["null", "string"]},
|
541
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
542
|
+
"_ab_source_file_url": {"type": "string"},
|
543
|
+
},
|
535
544
|
},
|
536
545
|
"name": "stream2",
|
537
546
|
"source_defined_cursor": True,
|
538
547
|
"supported_sync_modes": ["full_refresh", "incremental"],
|
539
|
-
}
|
540
|
-
{
|
541
|
-
"default_cursor_field": ["_ab_source_file_last_modified"],
|
542
|
-
"json_schema": {
|
543
|
-
"type": "object",
|
544
|
-
"properties": {},
|
545
|
-
},
|
546
|
-
"name": "stream3",
|
547
|
-
"source_defined_cursor": True,
|
548
|
-
"supported_sync_modes": ["full_refresh", "incremental"],
|
549
|
-
},
|
548
|
+
}
|
550
549
|
]
|
551
550
|
}
|
552
551
|
)
|
553
|
-
.set_expected_records([
|
552
|
+
.set_expected_records([
|
553
|
+
{
|
554
|
+
"data": {
|
555
|
+
"col1": "val11a",
|
556
|
+
"col2": "val12a",
|
557
|
+
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
558
|
+
"_ab_source_file_url": "a.csv",
|
559
|
+
},
|
560
|
+
"stream": "stream1",
|
561
|
+
},
|
562
|
+
{
|
563
|
+
"data": {
|
564
|
+
"col1": "val21a",
|
565
|
+
"col2": "val22a",
|
566
|
+
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
567
|
+
"_ab_source_file_url": "a.csv",
|
568
|
+
},
|
569
|
+
"stream": "stream1",
|
570
|
+
},
|
571
|
+
{
|
572
|
+
"data": {
|
573
|
+
"col1": "val11b",
|
574
|
+
"col2": "val12b",
|
575
|
+
"col3": "val13b",
|
576
|
+
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
577
|
+
"_ab_source_file_url": "b.csv",
|
578
|
+
},
|
579
|
+
"stream": "stream2",
|
580
|
+
},
|
581
|
+
{
|
582
|
+
"data": {
|
583
|
+
"col1": "val21b",
|
584
|
+
"col2": "val22b",
|
585
|
+
"col3": "val23b",
|
586
|
+
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
587
|
+
"_ab_source_file_url": "b.csv",
|
588
|
+
},
|
589
|
+
"stream": "stream2",
|
590
|
+
},
|
591
|
+
])
|
554
592
|
.set_expected_analytics(
|
555
593
|
[
|
556
594
|
AirbyteAnalyticsTraceMessage(type="file-cdk-csv-stream-count", value="2"),
|
557
|
-
AirbyteAnalyticsTraceMessage(type="file-cdk-jsonl-stream-count", value="1"),
|
558
595
|
]
|
559
596
|
)
|
560
597
|
).build()
|
@@ -1450,7 +1487,6 @@ empty_schema_inference_scenario: TestScenario[InMemoryFilesSource] = (
|
|
1450
1487
|
}
|
1451
1488
|
)
|
1452
1489
|
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
|
1453
|
-
.set_expected_records([])
|
1454
1490
|
).build()
|
1455
1491
|
|
1456
1492
|
schemaless_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
@@ -3009,6 +3045,61 @@ earlier_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
|
3009
3045
|
]
|
3010
3046
|
}
|
3011
3047
|
)
|
3012
|
-
.set_expected_records([])
|
3013
3048
|
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
|
3014
3049
|
).build()
|
3050
|
+
|
3051
|
+
csv_no_records_scenario: TestScenario[InMemoryFilesSource] = (
|
3052
|
+
TestScenarioBuilder[InMemoryFilesSource]()
|
3053
|
+
.set_name("csv_empty_no_records")
|
3054
|
+
.set_config(
|
3055
|
+
{
|
3056
|
+
"streams": [
|
3057
|
+
{
|
3058
|
+
"name": "stream1",
|
3059
|
+
"globs": ["*"],
|
3060
|
+
"validation_policy": "Emit Record",
|
3061
|
+
"input_schema": '{"col1": "boolean", "col2": "string"}',
|
3062
|
+
"format": {
|
3063
|
+
"filetype": "csv",
|
3064
|
+
"null_values": ["null"],
|
3065
|
+
},
|
3066
|
+
}
|
3067
|
+
],
|
3068
|
+
"start_date": "2023-06-04T03:54:07.000000Z",
|
3069
|
+
}
|
3070
|
+
)
|
3071
|
+
.set_source_builder(
|
3072
|
+
FileBasedSourceBuilder()
|
3073
|
+
.set_files(
|
3074
|
+
{
|
3075
|
+
"a.csv": {
|
3076
|
+
"contents": [("col1", "col2")], # column headers, but no data rows
|
3077
|
+
"last_modified": "2023-06-05T03:54:07.000Z",
|
3078
|
+
}
|
3079
|
+
}
|
3080
|
+
)
|
3081
|
+
.set_file_type("csv")
|
3082
|
+
)
|
3083
|
+
.set_expected_catalog(
|
3084
|
+
{
|
3085
|
+
"streams": [
|
3086
|
+
{
|
3087
|
+
"default_cursor_field": ["_ab_source_file_last_modified"],
|
3088
|
+
"json_schema": {
|
3089
|
+
"type": "object",
|
3090
|
+
"properties": {
|
3091
|
+
"col1": {"type": "boolean"},
|
3092
|
+
"col2": {"type": "string"},
|
3093
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
3094
|
+
"_ab_source_file_url": {"type": "string"},
|
3095
|
+
},
|
3096
|
+
},
|
3097
|
+
"name": "stream1",
|
3098
|
+
"source_defined_cursor": True,
|
3099
|
+
"supported_sync_modes": ["full_refresh", "incremental"],
|
3100
|
+
}
|
3101
|
+
]
|
3102
|
+
}
|
3103
|
+
)
|
3104
|
+
.set_expected_records([])
|
3105
|
+
).build()
|
@@ -23,7 +23,7 @@ from airbyte_cdk.sources.file_based.stream.concurrent.adapters import (
|
|
23
23
|
FileBasedStreamPartition,
|
24
24
|
FileBasedStreamPartitionGenerator,
|
25
25
|
)
|
26
|
-
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import
|
26
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedFinalStateCursor
|
27
27
|
from airbyte_cdk.sources.message import InMemoryMessageRepository
|
28
28
|
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
|
29
29
|
from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage
|
@@ -36,7 +36,7 @@ _ANY_SYNC_MODE = SyncMode.full_refresh
|
|
36
36
|
_ANY_STATE = {"state_key": "state_value"}
|
37
37
|
_ANY_CURSOR_FIELD = ["a", "cursor", "key"]
|
38
38
|
_STREAM_NAME = "stream"
|
39
|
-
_ANY_CURSOR = Mock(spec=
|
39
|
+
_ANY_CURSOR = Mock(spec=FileBasedFinalStateCursor)
|
40
40
|
|
41
41
|
|
42
42
|
@pytest.mark.parametrize(
|
@@ -165,7 +165,7 @@ class StreamFacadeTest(unittest.TestCase):
|
|
165
165
|
supported_sync_modes=[SyncMode.full_refresh],
|
166
166
|
)
|
167
167
|
self._legacy_stream = DefaultFileBasedStream(
|
168
|
-
cursor=
|
168
|
+
cursor=FileBasedFinalStateCursor(stream_config=MagicMock(), stream_namespace=None, message_repository=Mock()),
|
169
169
|
config=FileBasedStreamConfig(name="stream", format=CsvFormat()),
|
170
170
|
catalog_schema={},
|
171
171
|
stream_reader=MagicMock(),
|
@@ -50,6 +50,7 @@ from unit_tests.sources.file_based.scenarios.concurrent_incremental_scenarios im
|
|
50
50
|
single_csv_no_input_state_scenario_concurrent,
|
51
51
|
)
|
52
52
|
from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
53
|
+
csv_analytics_scenario,
|
53
54
|
csv_autogenerate_column_names_scenario,
|
54
55
|
csv_custom_bool_values_scenario,
|
55
56
|
csv_custom_delimiter_in_double_quotes_scenario,
|
@@ -61,6 +62,7 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
|
61
62
|
csv_multi_stream_scenario,
|
62
63
|
csv_newline_in_values_not_quoted_scenario,
|
63
64
|
csv_newline_in_values_quoted_value_scenario,
|
65
|
+
csv_no_records_scenario,
|
64
66
|
csv_single_stream_scenario,
|
65
67
|
csv_skip_after_header_scenario,
|
66
68
|
csv_skip_before_and_after_header_scenario,
|
@@ -75,7 +77,6 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
|
75
77
|
invalid_csv_scenario,
|
76
78
|
multi_csv_scenario,
|
77
79
|
multi_csv_stream_n_file_exceeds_limit_for_inference,
|
78
|
-
multi_format_analytics_scenario,
|
79
80
|
multi_stream_custom_format,
|
80
81
|
schemaless_csv_multi_stream_scenario,
|
81
82
|
schemaless_csv_scenario,
|
@@ -152,7 +153,13 @@ from unit_tests.sources.file_based.scenarios.validation_policy_scenarios import
|
|
152
153
|
)
|
153
154
|
from unit_tests.sources.file_based.test_scenarios import verify_check, verify_discover, verify_read, verify_spec
|
154
155
|
|
155
|
-
|
156
|
+
discover_failure_scenarios = [
|
157
|
+
earlier_csv_scenario,
|
158
|
+
empty_schema_inference_scenario,
|
159
|
+
]
|
160
|
+
|
161
|
+
discover_success_scenarios = [
|
162
|
+
csv_no_records_scenario,
|
156
163
|
csv_multi_stream_scenario,
|
157
164
|
csv_single_stream_scenario,
|
158
165
|
invalid_csv_scenario,
|
@@ -176,9 +183,7 @@ discover_scenarios = [
|
|
176
183
|
single_csv_file_is_skipped_if_same_modified_at_as_in_history,
|
177
184
|
single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history,
|
178
185
|
csv_custom_format_scenario,
|
179
|
-
earlier_csv_scenario,
|
180
186
|
multi_stream_custom_format,
|
181
|
-
empty_schema_inference_scenario,
|
182
187
|
single_parquet_scenario,
|
183
188
|
multi_parquet_scenario,
|
184
189
|
parquet_various_types_scenario,
|
@@ -260,12 +265,14 @@ discover_scenarios = [
|
|
260
265
|
single_csv_no_input_state_scenario_concurrent,
|
261
266
|
]
|
262
267
|
|
263
|
-
|
268
|
+
discover_scenarios = discover_failure_scenarios + discover_success_scenarios
|
269
|
+
|
270
|
+
read_scenarios = discover_success_scenarios + [
|
264
271
|
emit_record_scenario_multi_stream,
|
265
272
|
emit_record_scenario_single_stream,
|
266
273
|
skip_record_scenario_multi_stream,
|
267
274
|
skip_record_scenario_single_stream,
|
268
|
-
|
275
|
+
csv_analytics_scenario,
|
269
276
|
wait_for_rediscovery_scenario_multi_stream,
|
270
277
|
wait_for_rediscovery_scenario_single_stream,
|
271
278
|
]
|
@@ -16,6 +16,7 @@ from airbyte_cdk.sources import AbstractSource
|
|
16
16
|
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
|
17
17
|
from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput
|
18
18
|
from airbyte_cdk.test.entrypoint_wrapper import read as entrypoint_read
|
19
|
+
from airbyte_cdk.utils import message_utils
|
19
20
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
20
21
|
from airbyte_protocol.models import AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteCatalog
|
21
22
|
from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenario
|
@@ -71,7 +72,7 @@ def assert_exception(expected_exception: type[BaseException], output: Entrypoint
|
|
71
72
|
|
72
73
|
|
73
74
|
def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
|
74
|
-
|
75
|
+
records_and_state_messages, log_messages = output.records_and_state_messages, output.logs
|
75
76
|
logs = [message.log for message in log_messages if message.log.level.value in scenario.log_levels]
|
76
77
|
if scenario.expected_records is None:
|
77
78
|
return
|
@@ -85,7 +86,7 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
|
|
85
86
|
),
|
86
87
|
)
|
87
88
|
sorted_records = sorted(
|
88
|
-
filter(lambda r: r.record,
|
89
|
+
filter(lambda r: r.record, records_and_state_messages),
|
89
90
|
key=lambda record: ",".join(
|
90
91
|
f"{k}={v}" for k, v in sorted(record.record.data.items(), key=lambda items: (items[0], items[1])) if k != "emitted_at"
|
91
92
|
),
|
@@ -104,7 +105,9 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
|
|
104
105
|
assert actual.record.stream == expected["stream"]
|
105
106
|
|
106
107
|
expected_states = list(filter(lambda e: "data" not in e, expected_records))
|
107
|
-
states = list(filter(lambda r: r.state,
|
108
|
+
states = list(filter(lambda r: r.state, records_and_state_messages))
|
109
|
+
assert len(states) > 0, "No state messages emitted. Successful syncs should emit at least one stream state."
|
110
|
+
_verify_state_record_counts(sorted_records, states)
|
108
111
|
|
109
112
|
if hasattr(scenario.source, "cursor_cls") and issubclass(scenario.source.cursor_cls, AbstractConcurrentFileBasedCursor):
|
110
113
|
# Only check the last state emitted because we don't know the order the others will be in.
|
@@ -125,8 +128,34 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
|
|
125
128
|
_verify_analytics(analytics, scenario.expected_analytics)
|
126
129
|
|
127
130
|
|
131
|
+
def _verify_state_record_counts(records: List[AirbyteMessage], states: List[AirbyteMessage]) -> None:
|
132
|
+
actual_record_counts = {}
|
133
|
+
for record in records:
|
134
|
+
stream_descriptor = message_utils.get_stream_descriptor(record)
|
135
|
+
actual_record_counts[stream_descriptor] = actual_record_counts.get(stream_descriptor, 0) + 1
|
136
|
+
|
137
|
+
state_record_count_sums = {}
|
138
|
+
for state_message in states:
|
139
|
+
stream_descriptor = message_utils.get_stream_descriptor(state_message)
|
140
|
+
state_record_count_sums[stream_descriptor] = (
|
141
|
+
state_record_count_sums.get(stream_descriptor, 0)
|
142
|
+
+ state_message.state.sourceStats.recordCount
|
143
|
+
)
|
144
|
+
|
145
|
+
for stream, actual_count in actual_record_counts.items():
|
146
|
+
assert state_record_count_sums.get(stream) == actual_count
|
147
|
+
|
148
|
+
# We can have extra keys in state_record_count_sums if we processed a stream and reported 0 records
|
149
|
+
extra_keys = state_record_count_sums.keys() - actual_record_counts.keys()
|
150
|
+
for stream in extra_keys:
|
151
|
+
assert state_record_count_sums[stream] == 0
|
152
|
+
|
153
|
+
|
128
154
|
def _verify_analytics(analytics: List[AirbyteMessage], expected_analytics: Optional[List[AirbyteAnalyticsTraceMessage]]) -> None:
|
129
155
|
if expected_analytics:
|
156
|
+
assert len(analytics) == len(
|
157
|
+
expected_analytics), \
|
158
|
+
f"Number of actual analytics messages ({len(analytics)}) did not match expected ({len(expected_analytics)})"
|
130
159
|
for actual, expected in zip(analytics, expected_analytics):
|
131
160
|
actual_type, actual_value = actual.trace.analytics.type, actual.trace.analytics.value
|
132
161
|
expected_type = expected.type
|
@@ -21,7 +21,7 @@ from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageReposi
|
|
21
21
|
from airbyte_cdk.sources.source import TState
|
22
22
|
from airbyte_cdk.sources.streams import Stream
|
23
23
|
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
|
24
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField,
|
24
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, FinalStateCursor
|
25
25
|
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import EpochValueConcurrentStreamStateConverter
|
26
26
|
from airbyte_protocol.models import ConfiguredAirbyteStream
|
27
27
|
from unit_tests.sources.file_based.scenarios.scenario_builder import SourceBuilder
|
@@ -83,7 +83,7 @@ class StreamFacadeSource(ConcurrentSourceAdapter):
|
|
83
83
|
None,
|
84
84
|
)
|
85
85
|
if self._cursor_field
|
86
|
-
else
|
86
|
+
else FinalStateCursor(stream_name=stream.name, stream_namespace=stream.namespace, message_repository=self.message_repository),
|
87
87
|
)
|
88
88
|
for stream, state in zip(self._streams, stream_states)
|
89
89
|
]
|
@@ -4,7 +4,7 @@
|
|
4
4
|
import logging
|
5
5
|
|
6
6
|
from airbyte_cdk.sources.message import InMemoryMessageRepository
|
7
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import
|
7
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
|
8
8
|
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
|
9
9
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
10
10
|
from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
|
@@ -15,6 +15,8 @@ from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_str
|
|
15
15
|
InMemoryPartitionGenerator,
|
16
16
|
)
|
17
17
|
|
18
|
+
_message_repository = InMemoryMessageRepository()
|
19
|
+
|
18
20
|
_id_only_stream = DefaultStream(
|
19
21
|
partition_generator=InMemoryPartitionGenerator(
|
20
22
|
[InMemoryPartition("partition1", "stream1", None, [Record({"id": "1"}, "stream1"), Record({"id": "2"}, "stream1")])]
|
@@ -30,7 +32,7 @@ _id_only_stream = DefaultStream(
|
|
30
32
|
primary_key=[],
|
31
33
|
cursor_field=None,
|
32
34
|
logger=logging.getLogger("test_logger"),
|
33
|
-
cursor=
|
35
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
34
36
|
)
|
35
37
|
|
36
38
|
_id_only_stream_with_slice_logger = DefaultStream(
|
@@ -48,7 +50,7 @@ _id_only_stream_with_slice_logger = DefaultStream(
|
|
48
50
|
primary_key=[],
|
49
51
|
cursor_field=None,
|
50
52
|
logger=logging.getLogger("test_logger"),
|
51
|
-
cursor=
|
53
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
52
54
|
)
|
53
55
|
|
54
56
|
_id_only_stream_with_primary_key = DefaultStream(
|
@@ -66,7 +68,7 @@ _id_only_stream_with_primary_key = DefaultStream(
|
|
66
68
|
primary_key=["id"],
|
67
69
|
cursor_field=None,
|
68
70
|
logger=logging.getLogger("test_logger"),
|
69
|
-
cursor=
|
71
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
70
72
|
)
|
71
73
|
|
72
74
|
_id_only_stream_multiple_partitions = DefaultStream(
|
@@ -87,7 +89,7 @@ _id_only_stream_multiple_partitions = DefaultStream(
|
|
87
89
|
primary_key=[],
|
88
90
|
cursor_field=None,
|
89
91
|
logger=logging.getLogger("test_logger"),
|
90
|
-
cursor=
|
92
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
91
93
|
)
|
92
94
|
|
93
95
|
_id_only_stream_multiple_partitions_concurrency_level_two = DefaultStream(
|
@@ -108,7 +110,7 @@ _id_only_stream_multiple_partitions_concurrency_level_two = DefaultStream(
|
|
108
110
|
primary_key=[],
|
109
111
|
cursor_field=None,
|
110
112
|
logger=logging.getLogger("test_logger"),
|
111
|
-
cursor=
|
113
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
112
114
|
)
|
113
115
|
|
114
116
|
_stream_raising_exception = DefaultStream(
|
@@ -126,7 +128,7 @@ _stream_raising_exception = DefaultStream(
|
|
126
128
|
primary_key=[],
|
127
129
|
cursor_field=None,
|
128
130
|
logger=logging.getLogger("test_logger"),
|
129
|
-
cursor=
|
131
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
130
132
|
)
|
131
133
|
|
132
134
|
test_concurrent_cdk_single_stream = (
|
@@ -140,7 +142,7 @@ test_concurrent_cdk_single_stream = (
|
|
140
142
|
_id_only_stream,
|
141
143
|
]
|
142
144
|
)
|
143
|
-
.set_message_repository(
|
145
|
+
.set_message_repository(_message_repository)
|
144
146
|
)
|
145
147
|
.set_expected_records(
|
146
148
|
[
|
@@ -193,7 +195,7 @@ test_concurrent_cdk_single_stream_with_primary_key = (
|
|
193
195
|
_id_only_stream_with_primary_key,
|
194
196
|
]
|
195
197
|
)
|
196
|
-
.set_message_repository(
|
198
|
+
.set_message_repository(_message_repository)
|
197
199
|
)
|
198
200
|
.set_expected_records(
|
199
201
|
[
|
@@ -253,11 +255,11 @@ test_concurrent_cdk_multiple_streams = (
|
|
253
255
|
primary_key=[],
|
254
256
|
cursor_field=None,
|
255
257
|
logger=logging.getLogger("test_logger"),
|
256
|
-
cursor=
|
258
|
+
cursor=FinalStateCursor(stream_name="stream2", stream_namespace=None, message_repository=_message_repository),
|
257
259
|
),
|
258
260
|
]
|
259
261
|
)
|
260
|
-
.set_message_repository(
|
262
|
+
.set_message_repository(_message_repository)
|
261
263
|
)
|
262
264
|
.set_expected_records(
|
263
265
|
[
|
@@ -308,7 +310,7 @@ test_concurrent_cdk_partition_raises_exception = (
|
|
308
310
|
_stream_raising_exception,
|
309
311
|
]
|
310
312
|
)
|
311
|
-
.set_message_repository(
|
313
|
+
.set_message_repository(_message_repository)
|
312
314
|
)
|
313
315
|
.set_expected_records(
|
314
316
|
[
|
@@ -346,7 +348,7 @@ test_concurrent_cdk_single_stream_multiple_partitions = (
|
|
346
348
|
_id_only_stream_multiple_partitions,
|
347
349
|
]
|
348
350
|
)
|
349
|
-
.set_message_repository(
|
351
|
+
.set_message_repository(_message_repository)
|
350
352
|
)
|
351
353
|
.set_expected_records(
|
352
354
|
[
|
@@ -386,7 +388,7 @@ test_concurrent_cdk_single_stream_multiple_partitions_concurrency_level_two = (
|
|
386
388
|
_id_only_stream_multiple_partitions_concurrency_level_two,
|
387
389
|
]
|
388
390
|
)
|
389
|
-
.set_message_repository(
|
391
|
+
.set_message_repository(_message_repository)
|
390
392
|
)
|
391
393
|
.set_expected_records(
|
392
394
|
[
|
unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py
CHANGED
@@ -8,11 +8,11 @@ from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union
|
|
8
8
|
from airbyte_cdk.models import ConfiguredAirbyteCatalog, ConnectorSpecification, DestinationSyncMode, SyncMode
|
9
9
|
from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
|
10
10
|
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
|
11
|
-
from airbyte_cdk.sources.message import MessageRepository
|
11
|
+
from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
|
12
12
|
from airbyte_cdk.sources.streams import Stream
|
13
13
|
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
|
14
14
|
from airbyte_cdk.sources.streams.concurrent.availability_strategy import AbstractAvailabilityStrategy, StreamAvailability, StreamAvailable
|
15
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import
|
15
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
|
16
16
|
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
|
17
17
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
18
18
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
|
@@ -42,13 +42,14 @@ class ConcurrentCdkSource(ConcurrentSourceAdapter):
|
|
42
42
|
concurrent_source = ConcurrentSource.create(1, 1, streams[0]._logger, NeverLogSliceLogger(), message_repository)
|
43
43
|
super().__init__(concurrent_source)
|
44
44
|
self._streams = streams
|
45
|
+
self._message_repository = message_repository
|
45
46
|
|
46
47
|
def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
|
47
48
|
# Check is not verified because it is up to the source to implement this method
|
48
49
|
return True, None
|
49
50
|
|
50
51
|
def streams(self, config: Mapping[str, Any]) -> List[Stream]:
|
51
|
-
return [StreamFacade(s, LegacyStream(),
|
52
|
+
return [StreamFacade(s, LegacyStream(), FinalStateCursor(stream_name=s.name, stream_namespace=s.namespace, message_repository=self.message_repository), NeverLogSliceLogger(), s._logger) for s in self._streams]
|
52
53
|
|
53
54
|
def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
|
54
55
|
return ConnectorSpecification(connectionSpecification={})
|
@@ -57,7 +58,7 @@ class ConcurrentCdkSource(ConcurrentSourceAdapter):
|
|
57
58
|
return ConfiguredAirbyteCatalog(
|
58
59
|
streams=[
|
59
60
|
ConfiguredAirbyteStream(
|
60
|
-
stream=StreamFacade(s, LegacyStream(),
|
61
|
+
stream=StreamFacade(s, LegacyStream(), FinalStateCursor(stream_name=s.name, stream_namespace=s.namespace, message_repository=InMemoryMessageRepository()), NeverLogSliceLogger(), s._logger).as_airbyte_stream(),
|
61
62
|
sync_mode=SyncMode.full_refresh,
|
62
63
|
destination_sync_mode=DestinationSyncMode.overwrite,
|
63
64
|
)
|
@@ -5,8 +5,9 @@ import unittest
|
|
5
5
|
from unittest.mock import Mock
|
6
6
|
|
7
7
|
from airbyte_cdk.models import AirbyteStream, SyncMode
|
8
|
+
from airbyte_cdk.sources.message import InMemoryMessageRepository
|
8
9
|
from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE
|
9
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor,
|
10
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor
|
10
11
|
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
|
11
12
|
|
12
13
|
|
@@ -20,6 +21,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
20
21
|
self._cursor_field = None
|
21
22
|
self._logger = Mock()
|
22
23
|
self._cursor = Mock(spec=Cursor)
|
24
|
+
self._message_repository = InMemoryMessageRepository()
|
23
25
|
self._stream = DefaultStream(
|
24
26
|
self._partition_generator,
|
25
27
|
self._name,
|
@@ -28,7 +30,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
28
30
|
self._primary_key,
|
29
31
|
self._cursor_field,
|
30
32
|
self._logger,
|
31
|
-
|
33
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
32
34
|
)
|
33
35
|
|
34
36
|
def test_get_json_schema(self):
|
@@ -89,7 +91,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
89
91
|
["id"],
|
90
92
|
self._cursor_field,
|
91
93
|
self._logger,
|
92
|
-
|
94
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
93
95
|
)
|
94
96
|
|
95
97
|
expected_airbyte_stream = AirbyteStream(
|
@@ -121,7 +123,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
121
123
|
["id_a", "id_b"],
|
122
124
|
self._cursor_field,
|
123
125
|
self._logger,
|
124
|
-
|
126
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
125
127
|
)
|
126
128
|
|
127
129
|
expected_airbyte_stream = AirbyteStream(
|
@@ -153,7 +155,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
153
155
|
self._primary_key,
|
154
156
|
"date",
|
155
157
|
self._logger,
|
156
|
-
|
158
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
157
159
|
)
|
158
160
|
|
159
161
|
expected_airbyte_stream = AirbyteStream(
|
@@ -178,7 +180,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
178
180
|
self._primary_key,
|
179
181
|
self._cursor_field,
|
180
182
|
self._logger,
|
181
|
-
|
183
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
182
184
|
namespace="test",
|
183
185
|
)
|
184
186
|
expected_airbyte_stream = AirbyteStream(
|
@@ -26,7 +26,7 @@ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
|
26
26
|
from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
|
27
27
|
from airbyte_cdk.sources.streams import Stream
|
28
28
|
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
|
29
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor,
|
29
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor
|
30
30
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
31
31
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
32
32
|
from airbyte_cdk.sources.streams.core import StreamData
|
@@ -105,8 +105,9 @@ def _stream(slice_to_partition_mapping, slice_logger, logger, message_repository
|
|
105
105
|
return _MockStream(slice_to_partition_mapping)
|
106
106
|
|
107
107
|
|
108
|
-
def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor: Cursor =
|
108
|
+
def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor: Optional[Cursor] = None):
|
109
109
|
stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository)
|
110
|
+
cursor = cursor or FinalStateCursor(stream_name=stream.name, stream_namespace=stream.namespace, message_repository=message_repository)
|
110
111
|
source = Mock()
|
111
112
|
source._slice_logger = slice_logger
|
112
113
|
source.message_repository = message_repository
|