airbyte-cdk 0.68.4__py3-none-any.whl → 0.69.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/entrypoint.py +27 -7
- airbyte_cdk/sources/connector_state_manager.py +0 -1
- airbyte_cdk/sources/file_based/file_based_source.py +4 -2
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +2 -2
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +2 -2
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/{file_based_noop_cursor.py → file_based_final_state_cursor.py} +21 -6
- airbyte_cdk/sources/streams/concurrent/adapters.py +2 -2
- airbyte_cdk/sources/streams/concurrent/cursor.py +27 -3
- airbyte_cdk/sources/streams/concurrent/default_stream.py +7 -3
- airbyte_cdk/test/entrypoint_wrapper.py +1 -1
- airbyte_cdk/utils/message_utils.py +17 -0
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/RECORD +30 -28
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/WHEEL +1 -1
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -2
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +128 -37
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +3 -3
- unit_tests/sources/file_based/test_file_based_scenarios.py +13 -6
- unit_tests/sources/file_based/test_scenarios.py +32 -3
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +16 -14
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +5 -4
- unit_tests/sources/streams/concurrent/test_default_stream.py +8 -6
- unit_tests/sources/streams/test_stream_read.py +3 -2
- unit_tests/sources/test_concurrent_source.py +7 -5
- unit_tests/sources/test_source_read.py +2 -3
- unit_tests/test/test_entrypoint_wrapper.py +9 -6
- unit_tests/utils/test_message_utils.py +91 -0
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.68.4.dist-info → airbyte_cdk-0.69.1.dist-info}/top_level.txt +0 -0
@@ -467,30 +467,24 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
|
467
467
|
)
|
468
468
|
).build()
|
469
469
|
|
470
|
-
|
470
|
+
csv_analytics_scenario: TestScenario[InMemoryFilesSource] = (
|
471
471
|
TestScenarioBuilder[InMemoryFilesSource]()
|
472
|
-
.set_name("
|
472
|
+
.set_name("csv_analytics")
|
473
473
|
.set_config(
|
474
474
|
{
|
475
475
|
"streams": [
|
476
476
|
{
|
477
477
|
"name": "stream1",
|
478
478
|
"format": {"filetype": "csv"},
|
479
|
-
"globs": ["
|
479
|
+
"globs": ["a.csv"],
|
480
480
|
"validation_policy": "Emit Record",
|
481
481
|
},
|
482
482
|
{
|
483
483
|
"name": "stream2",
|
484
484
|
"format": {"filetype": "csv"},
|
485
|
-
"globs": ["
|
486
|
-
"validation_policy": "Emit Record",
|
487
|
-
},
|
488
|
-
{
|
489
|
-
"name": "stream3",
|
490
|
-
"format": {"filetype": "jsonl"},
|
491
|
-
"globs": ["file3.jsonl"],
|
485
|
+
"globs": ["b.csv"],
|
492
486
|
"validation_policy": "Emit Record",
|
493
|
-
}
|
487
|
+
}
|
494
488
|
]
|
495
489
|
}
|
496
490
|
)
|
@@ -498,17 +492,21 @@ multi_format_analytics_scenario: TestScenario[InMemoryFilesSource] = (
|
|
498
492
|
FileBasedSourceBuilder()
|
499
493
|
.set_files(
|
500
494
|
{
|
501
|
-
"
|
502
|
-
"contents": [
|
495
|
+
"a.csv": {
|
496
|
+
"contents": [
|
497
|
+
("col1", "col2"),
|
498
|
+
("val11a", "val12a"),
|
499
|
+
("val21a", "val22a"),
|
500
|
+
],
|
503
501
|
"last_modified": "2023-06-05T03:54:07.000Z",
|
504
502
|
},
|
505
|
-
"
|
506
|
-
"contents": [
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
"last_modified": "2023-06-
|
503
|
+
"b.csv": {
|
504
|
+
"contents": [
|
505
|
+
("col1", "col2", "col3"),
|
506
|
+
("val11b", "val12b", "val13b"),
|
507
|
+
("val21b", "val22b", "val23b"),
|
508
|
+
],
|
509
|
+
"last_modified": "2023-06-05T03:54:07.000Z",
|
512
510
|
},
|
513
511
|
}
|
514
512
|
)
|
@@ -521,7 +519,12 @@ multi_format_analytics_scenario: TestScenario[InMemoryFilesSource] = (
|
|
521
519
|
"default_cursor_field": ["_ab_source_file_last_modified"],
|
522
520
|
"json_schema": {
|
523
521
|
"type": "object",
|
524
|
-
"properties": {
|
522
|
+
"properties": {
|
523
|
+
"col1": {"type": ["null", "string"]},
|
524
|
+
"col2": {"type": ["null", "string"]},
|
525
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
526
|
+
"_ab_source_file_url": {"type": "string"},
|
527
|
+
},
|
525
528
|
},
|
526
529
|
"name": "stream1",
|
527
530
|
"source_defined_cursor": True,
|
@@ -531,30 +534,64 @@ multi_format_analytics_scenario: TestScenario[InMemoryFilesSource] = (
|
|
531
534
|
"default_cursor_field": ["_ab_source_file_last_modified"],
|
532
535
|
"json_schema": {
|
533
536
|
"type": "object",
|
534
|
-
"properties": {
|
537
|
+
"properties": {
|
538
|
+
"col1": {"type": ["null", "string"]},
|
539
|
+
"col2": {"type": ["null", "string"]},
|
540
|
+
"col3": {"type": ["null", "string"]},
|
541
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
542
|
+
"_ab_source_file_url": {"type": "string"},
|
543
|
+
},
|
535
544
|
},
|
536
545
|
"name": "stream2",
|
537
546
|
"source_defined_cursor": True,
|
538
547
|
"supported_sync_modes": ["full_refresh", "incremental"],
|
539
|
-
}
|
540
|
-
{
|
541
|
-
"default_cursor_field": ["_ab_source_file_last_modified"],
|
542
|
-
"json_schema": {
|
543
|
-
"type": "object",
|
544
|
-
"properties": {},
|
545
|
-
},
|
546
|
-
"name": "stream3",
|
547
|
-
"source_defined_cursor": True,
|
548
|
-
"supported_sync_modes": ["full_refresh", "incremental"],
|
549
|
-
},
|
548
|
+
}
|
550
549
|
]
|
551
550
|
}
|
552
551
|
)
|
553
|
-
.set_expected_records([
|
552
|
+
.set_expected_records([
|
553
|
+
{
|
554
|
+
"data": {
|
555
|
+
"col1": "val11a",
|
556
|
+
"col2": "val12a",
|
557
|
+
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
558
|
+
"_ab_source_file_url": "a.csv",
|
559
|
+
},
|
560
|
+
"stream": "stream1",
|
561
|
+
},
|
562
|
+
{
|
563
|
+
"data": {
|
564
|
+
"col1": "val21a",
|
565
|
+
"col2": "val22a",
|
566
|
+
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
567
|
+
"_ab_source_file_url": "a.csv",
|
568
|
+
},
|
569
|
+
"stream": "stream1",
|
570
|
+
},
|
571
|
+
{
|
572
|
+
"data": {
|
573
|
+
"col1": "val11b",
|
574
|
+
"col2": "val12b",
|
575
|
+
"col3": "val13b",
|
576
|
+
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
577
|
+
"_ab_source_file_url": "b.csv",
|
578
|
+
},
|
579
|
+
"stream": "stream2",
|
580
|
+
},
|
581
|
+
{
|
582
|
+
"data": {
|
583
|
+
"col1": "val21b",
|
584
|
+
"col2": "val22b",
|
585
|
+
"col3": "val23b",
|
586
|
+
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
587
|
+
"_ab_source_file_url": "b.csv",
|
588
|
+
},
|
589
|
+
"stream": "stream2",
|
590
|
+
},
|
591
|
+
])
|
554
592
|
.set_expected_analytics(
|
555
593
|
[
|
556
594
|
AirbyteAnalyticsTraceMessage(type="file-cdk-csv-stream-count", value="2"),
|
557
|
-
AirbyteAnalyticsTraceMessage(type="file-cdk-jsonl-stream-count", value="1"),
|
558
595
|
]
|
559
596
|
)
|
560
597
|
).build()
|
@@ -1450,7 +1487,6 @@ empty_schema_inference_scenario: TestScenario[InMemoryFilesSource] = (
|
|
1450
1487
|
}
|
1451
1488
|
)
|
1452
1489
|
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
|
1453
|
-
.set_expected_records([])
|
1454
1490
|
).build()
|
1455
1491
|
|
1456
1492
|
schemaless_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
@@ -3009,6 +3045,61 @@ earlier_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
|
3009
3045
|
]
|
3010
3046
|
}
|
3011
3047
|
)
|
3012
|
-
.set_expected_records([])
|
3013
3048
|
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
|
3014
3049
|
).build()
|
3050
|
+
|
3051
|
+
csv_no_records_scenario: TestScenario[InMemoryFilesSource] = (
|
3052
|
+
TestScenarioBuilder[InMemoryFilesSource]()
|
3053
|
+
.set_name("csv_empty_no_records")
|
3054
|
+
.set_config(
|
3055
|
+
{
|
3056
|
+
"streams": [
|
3057
|
+
{
|
3058
|
+
"name": "stream1",
|
3059
|
+
"globs": ["*"],
|
3060
|
+
"validation_policy": "Emit Record",
|
3061
|
+
"input_schema": '{"col1": "boolean", "col2": "string"}',
|
3062
|
+
"format": {
|
3063
|
+
"filetype": "csv",
|
3064
|
+
"null_values": ["null"],
|
3065
|
+
},
|
3066
|
+
}
|
3067
|
+
],
|
3068
|
+
"start_date": "2023-06-04T03:54:07.000000Z",
|
3069
|
+
}
|
3070
|
+
)
|
3071
|
+
.set_source_builder(
|
3072
|
+
FileBasedSourceBuilder()
|
3073
|
+
.set_files(
|
3074
|
+
{
|
3075
|
+
"a.csv": {
|
3076
|
+
"contents": [("col1", "col2")], # column headers, but no data rows
|
3077
|
+
"last_modified": "2023-06-05T03:54:07.000Z",
|
3078
|
+
}
|
3079
|
+
}
|
3080
|
+
)
|
3081
|
+
.set_file_type("csv")
|
3082
|
+
)
|
3083
|
+
.set_expected_catalog(
|
3084
|
+
{
|
3085
|
+
"streams": [
|
3086
|
+
{
|
3087
|
+
"default_cursor_field": ["_ab_source_file_last_modified"],
|
3088
|
+
"json_schema": {
|
3089
|
+
"type": "object",
|
3090
|
+
"properties": {
|
3091
|
+
"col1": {"type": "boolean"},
|
3092
|
+
"col2": {"type": "string"},
|
3093
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
3094
|
+
"_ab_source_file_url": {"type": "string"},
|
3095
|
+
},
|
3096
|
+
},
|
3097
|
+
"name": "stream1",
|
3098
|
+
"source_defined_cursor": True,
|
3099
|
+
"supported_sync_modes": ["full_refresh", "incremental"],
|
3100
|
+
}
|
3101
|
+
]
|
3102
|
+
}
|
3103
|
+
)
|
3104
|
+
.set_expected_records([])
|
3105
|
+
).build()
|
@@ -23,7 +23,7 @@ from airbyte_cdk.sources.file_based.stream.concurrent.adapters import (
|
|
23
23
|
FileBasedStreamPartition,
|
24
24
|
FileBasedStreamPartitionGenerator,
|
25
25
|
)
|
26
|
-
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import
|
26
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedFinalStateCursor
|
27
27
|
from airbyte_cdk.sources.message import InMemoryMessageRepository
|
28
28
|
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
|
29
29
|
from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage
|
@@ -36,7 +36,7 @@ _ANY_SYNC_MODE = SyncMode.full_refresh
|
|
36
36
|
_ANY_STATE = {"state_key": "state_value"}
|
37
37
|
_ANY_CURSOR_FIELD = ["a", "cursor", "key"]
|
38
38
|
_STREAM_NAME = "stream"
|
39
|
-
_ANY_CURSOR = Mock(spec=
|
39
|
+
_ANY_CURSOR = Mock(spec=FileBasedFinalStateCursor)
|
40
40
|
|
41
41
|
|
42
42
|
@pytest.mark.parametrize(
|
@@ -165,7 +165,7 @@ class StreamFacadeTest(unittest.TestCase):
|
|
165
165
|
supported_sync_modes=[SyncMode.full_refresh],
|
166
166
|
)
|
167
167
|
self._legacy_stream = DefaultFileBasedStream(
|
168
|
-
cursor=
|
168
|
+
cursor=FileBasedFinalStateCursor(stream_config=MagicMock(), stream_namespace=None, message_repository=Mock()),
|
169
169
|
config=FileBasedStreamConfig(name="stream", format=CsvFormat()),
|
170
170
|
catalog_schema={},
|
171
171
|
stream_reader=MagicMock(),
|
@@ -50,6 +50,7 @@ from unit_tests.sources.file_based.scenarios.concurrent_incremental_scenarios im
|
|
50
50
|
single_csv_no_input_state_scenario_concurrent,
|
51
51
|
)
|
52
52
|
from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
53
|
+
csv_analytics_scenario,
|
53
54
|
csv_autogenerate_column_names_scenario,
|
54
55
|
csv_custom_bool_values_scenario,
|
55
56
|
csv_custom_delimiter_in_double_quotes_scenario,
|
@@ -61,6 +62,7 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
|
61
62
|
csv_multi_stream_scenario,
|
62
63
|
csv_newline_in_values_not_quoted_scenario,
|
63
64
|
csv_newline_in_values_quoted_value_scenario,
|
65
|
+
csv_no_records_scenario,
|
64
66
|
csv_single_stream_scenario,
|
65
67
|
csv_skip_after_header_scenario,
|
66
68
|
csv_skip_before_and_after_header_scenario,
|
@@ -75,7 +77,6 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
|
75
77
|
invalid_csv_scenario,
|
76
78
|
multi_csv_scenario,
|
77
79
|
multi_csv_stream_n_file_exceeds_limit_for_inference,
|
78
|
-
multi_format_analytics_scenario,
|
79
80
|
multi_stream_custom_format,
|
80
81
|
schemaless_csv_multi_stream_scenario,
|
81
82
|
schemaless_csv_scenario,
|
@@ -152,7 +153,13 @@ from unit_tests.sources.file_based.scenarios.validation_policy_scenarios import
|
|
152
153
|
)
|
153
154
|
from unit_tests.sources.file_based.test_scenarios import verify_check, verify_discover, verify_read, verify_spec
|
154
155
|
|
155
|
-
|
156
|
+
discover_failure_scenarios = [
|
157
|
+
earlier_csv_scenario,
|
158
|
+
empty_schema_inference_scenario,
|
159
|
+
]
|
160
|
+
|
161
|
+
discover_success_scenarios = [
|
162
|
+
csv_no_records_scenario,
|
156
163
|
csv_multi_stream_scenario,
|
157
164
|
csv_single_stream_scenario,
|
158
165
|
invalid_csv_scenario,
|
@@ -176,9 +183,7 @@ discover_scenarios = [
|
|
176
183
|
single_csv_file_is_skipped_if_same_modified_at_as_in_history,
|
177
184
|
single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history,
|
178
185
|
csv_custom_format_scenario,
|
179
|
-
earlier_csv_scenario,
|
180
186
|
multi_stream_custom_format,
|
181
|
-
empty_schema_inference_scenario,
|
182
187
|
single_parquet_scenario,
|
183
188
|
multi_parquet_scenario,
|
184
189
|
parquet_various_types_scenario,
|
@@ -260,12 +265,14 @@ discover_scenarios = [
|
|
260
265
|
single_csv_no_input_state_scenario_concurrent,
|
261
266
|
]
|
262
267
|
|
263
|
-
|
268
|
+
discover_scenarios = discover_failure_scenarios + discover_success_scenarios
|
269
|
+
|
270
|
+
read_scenarios = discover_success_scenarios + [
|
264
271
|
emit_record_scenario_multi_stream,
|
265
272
|
emit_record_scenario_single_stream,
|
266
273
|
skip_record_scenario_multi_stream,
|
267
274
|
skip_record_scenario_single_stream,
|
268
|
-
|
275
|
+
csv_analytics_scenario,
|
269
276
|
wait_for_rediscovery_scenario_multi_stream,
|
270
277
|
wait_for_rediscovery_scenario_single_stream,
|
271
278
|
]
|
@@ -16,6 +16,7 @@ from airbyte_cdk.sources import AbstractSource
|
|
16
16
|
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
|
17
17
|
from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput
|
18
18
|
from airbyte_cdk.test.entrypoint_wrapper import read as entrypoint_read
|
19
|
+
from airbyte_cdk.utils import message_utils
|
19
20
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
20
21
|
from airbyte_protocol.models import AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteCatalog
|
21
22
|
from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenario
|
@@ -71,7 +72,7 @@ def assert_exception(expected_exception: type[BaseException], output: Entrypoint
|
|
71
72
|
|
72
73
|
|
73
74
|
def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
|
74
|
-
|
75
|
+
records_and_state_messages, log_messages = output.records_and_state_messages, output.logs
|
75
76
|
logs = [message.log for message in log_messages if message.log.level.value in scenario.log_levels]
|
76
77
|
if scenario.expected_records is None:
|
77
78
|
return
|
@@ -85,7 +86,7 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
|
|
85
86
|
),
|
86
87
|
)
|
87
88
|
sorted_records = sorted(
|
88
|
-
filter(lambda r: r.record,
|
89
|
+
filter(lambda r: r.record, records_and_state_messages),
|
89
90
|
key=lambda record: ",".join(
|
90
91
|
f"{k}={v}" for k, v in sorted(record.record.data.items(), key=lambda items: (items[0], items[1])) if k != "emitted_at"
|
91
92
|
),
|
@@ -104,7 +105,9 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
|
|
104
105
|
assert actual.record.stream == expected["stream"]
|
105
106
|
|
106
107
|
expected_states = list(filter(lambda e: "data" not in e, expected_records))
|
107
|
-
states = list(filter(lambda r: r.state,
|
108
|
+
states = list(filter(lambda r: r.state, records_and_state_messages))
|
109
|
+
assert len(states) > 0, "No state messages emitted. Successful syncs should emit at least one stream state."
|
110
|
+
_verify_state_record_counts(sorted_records, states)
|
108
111
|
|
109
112
|
if hasattr(scenario.source, "cursor_cls") and issubclass(scenario.source.cursor_cls, AbstractConcurrentFileBasedCursor):
|
110
113
|
# Only check the last state emitted because we don't know the order the others will be in.
|
@@ -125,8 +128,34 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
|
|
125
128
|
_verify_analytics(analytics, scenario.expected_analytics)
|
126
129
|
|
127
130
|
|
131
|
+
def _verify_state_record_counts(records: List[AirbyteMessage], states: List[AirbyteMessage]) -> None:
|
132
|
+
actual_record_counts = {}
|
133
|
+
for record in records:
|
134
|
+
stream_descriptor = message_utils.get_stream_descriptor(record)
|
135
|
+
actual_record_counts[stream_descriptor] = actual_record_counts.get(stream_descriptor, 0) + 1
|
136
|
+
|
137
|
+
state_record_count_sums = {}
|
138
|
+
for state_message in states:
|
139
|
+
stream_descriptor = message_utils.get_stream_descriptor(state_message)
|
140
|
+
state_record_count_sums[stream_descriptor] = (
|
141
|
+
state_record_count_sums.get(stream_descriptor, 0)
|
142
|
+
+ state_message.state.sourceStats.recordCount
|
143
|
+
)
|
144
|
+
|
145
|
+
for stream, actual_count in actual_record_counts.items():
|
146
|
+
assert state_record_count_sums.get(stream) == actual_count
|
147
|
+
|
148
|
+
# We can have extra keys in state_record_count_sums if we processed a stream and reported 0 records
|
149
|
+
extra_keys = state_record_count_sums.keys() - actual_record_counts.keys()
|
150
|
+
for stream in extra_keys:
|
151
|
+
assert state_record_count_sums[stream] == 0
|
152
|
+
|
153
|
+
|
128
154
|
def _verify_analytics(analytics: List[AirbyteMessage], expected_analytics: Optional[List[AirbyteAnalyticsTraceMessage]]) -> None:
|
129
155
|
if expected_analytics:
|
156
|
+
assert len(analytics) == len(
|
157
|
+
expected_analytics), \
|
158
|
+
f"Number of actual analytics messages ({len(analytics)}) did not match expected ({len(expected_analytics)})"
|
130
159
|
for actual, expected in zip(analytics, expected_analytics):
|
131
160
|
actual_type, actual_value = actual.trace.analytics.type, actual.trace.analytics.value
|
132
161
|
expected_type = expected.type
|
@@ -21,7 +21,7 @@ from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageReposi
|
|
21
21
|
from airbyte_cdk.sources.source import TState
|
22
22
|
from airbyte_cdk.sources.streams import Stream
|
23
23
|
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
|
24
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField,
|
24
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, FinalStateCursor
|
25
25
|
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import EpochValueConcurrentStreamStateConverter
|
26
26
|
from airbyte_protocol.models import ConfiguredAirbyteStream
|
27
27
|
from unit_tests.sources.file_based.scenarios.scenario_builder import SourceBuilder
|
@@ -83,7 +83,7 @@ class StreamFacadeSource(ConcurrentSourceAdapter):
|
|
83
83
|
None,
|
84
84
|
)
|
85
85
|
if self._cursor_field
|
86
|
-
else
|
86
|
+
else FinalStateCursor(stream_name=stream.name, stream_namespace=stream.namespace, message_repository=self.message_repository),
|
87
87
|
)
|
88
88
|
for stream, state in zip(self._streams, stream_states)
|
89
89
|
]
|
@@ -4,7 +4,7 @@
|
|
4
4
|
import logging
|
5
5
|
|
6
6
|
from airbyte_cdk.sources.message import InMemoryMessageRepository
|
7
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import
|
7
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
|
8
8
|
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
|
9
9
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
10
10
|
from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
|
@@ -15,6 +15,8 @@ from unit_tests.sources.streams.concurrent.scenarios.thread_based_concurrent_str
|
|
15
15
|
InMemoryPartitionGenerator,
|
16
16
|
)
|
17
17
|
|
18
|
+
_message_repository = InMemoryMessageRepository()
|
19
|
+
|
18
20
|
_id_only_stream = DefaultStream(
|
19
21
|
partition_generator=InMemoryPartitionGenerator(
|
20
22
|
[InMemoryPartition("partition1", "stream1", None, [Record({"id": "1"}, "stream1"), Record({"id": "2"}, "stream1")])]
|
@@ -30,7 +32,7 @@ _id_only_stream = DefaultStream(
|
|
30
32
|
primary_key=[],
|
31
33
|
cursor_field=None,
|
32
34
|
logger=logging.getLogger("test_logger"),
|
33
|
-
cursor=
|
35
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
34
36
|
)
|
35
37
|
|
36
38
|
_id_only_stream_with_slice_logger = DefaultStream(
|
@@ -48,7 +50,7 @@ _id_only_stream_with_slice_logger = DefaultStream(
|
|
48
50
|
primary_key=[],
|
49
51
|
cursor_field=None,
|
50
52
|
logger=logging.getLogger("test_logger"),
|
51
|
-
cursor=
|
53
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
52
54
|
)
|
53
55
|
|
54
56
|
_id_only_stream_with_primary_key = DefaultStream(
|
@@ -66,7 +68,7 @@ _id_only_stream_with_primary_key = DefaultStream(
|
|
66
68
|
primary_key=["id"],
|
67
69
|
cursor_field=None,
|
68
70
|
logger=logging.getLogger("test_logger"),
|
69
|
-
cursor=
|
71
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
70
72
|
)
|
71
73
|
|
72
74
|
_id_only_stream_multiple_partitions = DefaultStream(
|
@@ -87,7 +89,7 @@ _id_only_stream_multiple_partitions = DefaultStream(
|
|
87
89
|
primary_key=[],
|
88
90
|
cursor_field=None,
|
89
91
|
logger=logging.getLogger("test_logger"),
|
90
|
-
cursor=
|
92
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
91
93
|
)
|
92
94
|
|
93
95
|
_id_only_stream_multiple_partitions_concurrency_level_two = DefaultStream(
|
@@ -108,7 +110,7 @@ _id_only_stream_multiple_partitions_concurrency_level_two = DefaultStream(
|
|
108
110
|
primary_key=[],
|
109
111
|
cursor_field=None,
|
110
112
|
logger=logging.getLogger("test_logger"),
|
111
|
-
cursor=
|
113
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
112
114
|
)
|
113
115
|
|
114
116
|
_stream_raising_exception = DefaultStream(
|
@@ -126,7 +128,7 @@ _stream_raising_exception = DefaultStream(
|
|
126
128
|
primary_key=[],
|
127
129
|
cursor_field=None,
|
128
130
|
logger=logging.getLogger("test_logger"),
|
129
|
-
cursor=
|
131
|
+
cursor=FinalStateCursor(stream_name="stream1", stream_namespace=None, message_repository=_message_repository),
|
130
132
|
)
|
131
133
|
|
132
134
|
test_concurrent_cdk_single_stream = (
|
@@ -140,7 +142,7 @@ test_concurrent_cdk_single_stream = (
|
|
140
142
|
_id_only_stream,
|
141
143
|
]
|
142
144
|
)
|
143
|
-
.set_message_repository(
|
145
|
+
.set_message_repository(_message_repository)
|
144
146
|
)
|
145
147
|
.set_expected_records(
|
146
148
|
[
|
@@ -193,7 +195,7 @@ test_concurrent_cdk_single_stream_with_primary_key = (
|
|
193
195
|
_id_only_stream_with_primary_key,
|
194
196
|
]
|
195
197
|
)
|
196
|
-
.set_message_repository(
|
198
|
+
.set_message_repository(_message_repository)
|
197
199
|
)
|
198
200
|
.set_expected_records(
|
199
201
|
[
|
@@ -253,11 +255,11 @@ test_concurrent_cdk_multiple_streams = (
|
|
253
255
|
primary_key=[],
|
254
256
|
cursor_field=None,
|
255
257
|
logger=logging.getLogger("test_logger"),
|
256
|
-
cursor=
|
258
|
+
cursor=FinalStateCursor(stream_name="stream2", stream_namespace=None, message_repository=_message_repository),
|
257
259
|
),
|
258
260
|
]
|
259
261
|
)
|
260
|
-
.set_message_repository(
|
262
|
+
.set_message_repository(_message_repository)
|
261
263
|
)
|
262
264
|
.set_expected_records(
|
263
265
|
[
|
@@ -308,7 +310,7 @@ test_concurrent_cdk_partition_raises_exception = (
|
|
308
310
|
_stream_raising_exception,
|
309
311
|
]
|
310
312
|
)
|
311
|
-
.set_message_repository(
|
313
|
+
.set_message_repository(_message_repository)
|
312
314
|
)
|
313
315
|
.set_expected_records(
|
314
316
|
[
|
@@ -346,7 +348,7 @@ test_concurrent_cdk_single_stream_multiple_partitions = (
|
|
346
348
|
_id_only_stream_multiple_partitions,
|
347
349
|
]
|
348
350
|
)
|
349
|
-
.set_message_repository(
|
351
|
+
.set_message_repository(_message_repository)
|
350
352
|
)
|
351
353
|
.set_expected_records(
|
352
354
|
[
|
@@ -386,7 +388,7 @@ test_concurrent_cdk_single_stream_multiple_partitions_concurrency_level_two = (
|
|
386
388
|
_id_only_stream_multiple_partitions_concurrency_level_two,
|
387
389
|
]
|
388
390
|
)
|
389
|
-
.set_message_repository(
|
391
|
+
.set_message_repository(_message_repository)
|
390
392
|
)
|
391
393
|
.set_expected_records(
|
392
394
|
[
|
unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py
CHANGED
@@ -8,11 +8,11 @@ from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union
|
|
8
8
|
from airbyte_cdk.models import ConfiguredAirbyteCatalog, ConnectorSpecification, DestinationSyncMode, SyncMode
|
9
9
|
from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
|
10
10
|
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
|
11
|
-
from airbyte_cdk.sources.message import MessageRepository
|
11
|
+
from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
|
12
12
|
from airbyte_cdk.sources.streams import Stream
|
13
13
|
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
|
14
14
|
from airbyte_cdk.sources.streams.concurrent.availability_strategy import AbstractAvailabilityStrategy, StreamAvailability, StreamAvailable
|
15
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import
|
15
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
|
16
16
|
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
|
17
17
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
18
18
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
|
@@ -42,13 +42,14 @@ class ConcurrentCdkSource(ConcurrentSourceAdapter):
|
|
42
42
|
concurrent_source = ConcurrentSource.create(1, 1, streams[0]._logger, NeverLogSliceLogger(), message_repository)
|
43
43
|
super().__init__(concurrent_source)
|
44
44
|
self._streams = streams
|
45
|
+
self._message_repository = message_repository
|
45
46
|
|
46
47
|
def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
|
47
48
|
# Check is not verified because it is up to the source to implement this method
|
48
49
|
return True, None
|
49
50
|
|
50
51
|
def streams(self, config: Mapping[str, Any]) -> List[Stream]:
|
51
|
-
return [StreamFacade(s, LegacyStream(),
|
52
|
+
return [StreamFacade(s, LegacyStream(), FinalStateCursor(stream_name=s.name, stream_namespace=s.namespace, message_repository=self.message_repository), NeverLogSliceLogger(), s._logger) for s in self._streams]
|
52
53
|
|
53
54
|
def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
|
54
55
|
return ConnectorSpecification(connectionSpecification={})
|
@@ -57,7 +58,7 @@ class ConcurrentCdkSource(ConcurrentSourceAdapter):
|
|
57
58
|
return ConfiguredAirbyteCatalog(
|
58
59
|
streams=[
|
59
60
|
ConfiguredAirbyteStream(
|
60
|
-
stream=StreamFacade(s, LegacyStream(),
|
61
|
+
stream=StreamFacade(s, LegacyStream(), FinalStateCursor(stream_name=s.name, stream_namespace=s.namespace, message_repository=InMemoryMessageRepository()), NeverLogSliceLogger(), s._logger).as_airbyte_stream(),
|
61
62
|
sync_mode=SyncMode.full_refresh,
|
62
63
|
destination_sync_mode=DestinationSyncMode.overwrite,
|
63
64
|
)
|
@@ -5,8 +5,9 @@ import unittest
|
|
5
5
|
from unittest.mock import Mock
|
6
6
|
|
7
7
|
from airbyte_cdk.models import AirbyteStream, SyncMode
|
8
|
+
from airbyte_cdk.sources.message import InMemoryMessageRepository
|
8
9
|
from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE
|
9
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor,
|
10
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor
|
10
11
|
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
|
11
12
|
|
12
13
|
|
@@ -20,6 +21,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
20
21
|
self._cursor_field = None
|
21
22
|
self._logger = Mock()
|
22
23
|
self._cursor = Mock(spec=Cursor)
|
24
|
+
self._message_repository = InMemoryMessageRepository()
|
23
25
|
self._stream = DefaultStream(
|
24
26
|
self._partition_generator,
|
25
27
|
self._name,
|
@@ -28,7 +30,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
28
30
|
self._primary_key,
|
29
31
|
self._cursor_field,
|
30
32
|
self._logger,
|
31
|
-
|
33
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
32
34
|
)
|
33
35
|
|
34
36
|
def test_get_json_schema(self):
|
@@ -89,7 +91,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
89
91
|
["id"],
|
90
92
|
self._cursor_field,
|
91
93
|
self._logger,
|
92
|
-
|
94
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
93
95
|
)
|
94
96
|
|
95
97
|
expected_airbyte_stream = AirbyteStream(
|
@@ -121,7 +123,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
121
123
|
["id_a", "id_b"],
|
122
124
|
self._cursor_field,
|
123
125
|
self._logger,
|
124
|
-
|
126
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
125
127
|
)
|
126
128
|
|
127
129
|
expected_airbyte_stream = AirbyteStream(
|
@@ -153,7 +155,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
153
155
|
self._primary_key,
|
154
156
|
"date",
|
155
157
|
self._logger,
|
156
|
-
|
158
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
157
159
|
)
|
158
160
|
|
159
161
|
expected_airbyte_stream = AirbyteStream(
|
@@ -178,7 +180,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
|
|
178
180
|
self._primary_key,
|
179
181
|
self._cursor_field,
|
180
182
|
self._logger,
|
181
|
-
|
183
|
+
FinalStateCursor(stream_name=self._name, stream_namespace=None, message_repository=self._message_repository),
|
182
184
|
namespace="test",
|
183
185
|
)
|
184
186
|
expected_airbyte_stream = AirbyteStream(
|
@@ -26,7 +26,7 @@ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
|
26
26
|
from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
|
27
27
|
from airbyte_cdk.sources.streams import Stream
|
28
28
|
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
|
29
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor,
|
29
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor
|
30
30
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
31
31
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
32
32
|
from airbyte_cdk.sources.streams.core import StreamData
|
@@ -105,8 +105,9 @@ def _stream(slice_to_partition_mapping, slice_logger, logger, message_repository
|
|
105
105
|
return _MockStream(slice_to_partition_mapping)
|
106
106
|
|
107
107
|
|
108
|
-
def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor: Cursor =
|
108
|
+
def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor: Optional[Cursor] = None):
|
109
109
|
stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository)
|
110
|
+
cursor = cursor or FinalStateCursor(stream_name=stream.name, stream_namespace=stream.namespace, message_repository=message_repository)
|
110
111
|
source = Mock()
|
111
112
|
source._slice_logger = slice_logger
|
112
113
|
source.message_repository = message_repository
|