airbyte-cdk 0.50.0__py3-none-any.whl → 0.50.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. airbyte_cdk/entrypoint.py +7 -0
  2. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -3
  3. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -3
  4. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +9 -9
  5. airbyte_cdk/sources/file_based/config/csv_format.py +42 -6
  6. airbyte_cdk/sources/file_based/file_based_source.py +4 -5
  7. airbyte_cdk/sources/file_based/file_types/csv_parser.py +114 -59
  8. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +2 -2
  9. airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} +9 -1
  10. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +10 -10
  11. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +15 -2
  12. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA +1 -1
  13. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/RECORD +25 -24
  14. unit_tests/sources/file_based/config/test_csv_format.py +23 -0
  15. unit_tests/sources/file_based/file_types/test_csv_parser.py +50 -18
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/in_memory_files_source.py +11 -3
  18. unit_tests/sources/file_based/scenarios/csv_scenarios.py +1254 -47
  19. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +6 -5
  20. unit_tests/sources/file_based/scenarios/scenario_builder.py +8 -7
  21. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +13 -12
  22. unit_tests/sources/file_based/test_scenarios.py +30 -0
  23. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/LICENSE.txt +0 -0
  24. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/WHEEL +0 -0
  25. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+ from unit_tests.sources.file_based.helpers import LowHistoryLimitCursor
5
6
  from unit_tests.sources.file_based.scenarios.scenario_builder import IncrementalScenarioConfig, TestScenarioBuilder
6
7
 
7
8
  single_csv_input_state_is_earlier_scenario = (
@@ -1004,7 +1005,7 @@ multi_csv_remove_old_files_if_history_is_full_scenario = (
1004
1005
  }
1005
1006
  )
1006
1007
  .set_file_type("csv")
1007
- .set_max_history_size(3)
1008
+ .set_cursor_cls(LowHistoryLimitCursor)
1008
1009
  .set_expected_catalog(
1009
1010
  {
1010
1011
  "streams": [
@@ -1151,7 +1152,7 @@ multi_csv_same_timestamp_more_files_than_history_size_scenario = (
1151
1152
  }
1152
1153
  )
1153
1154
  .set_file_type("csv")
1154
- .set_max_history_size(3)
1155
+ .set_cursor_cls(LowHistoryLimitCursor)
1155
1156
  .set_expected_catalog(
1156
1157
  {
1157
1158
  "streams": [
@@ -1268,7 +1269,7 @@ multi_csv_sync_recent_files_if_history_is_incomplete_scenario = (
1268
1269
  },
1269
1270
  }
1270
1271
  )
1271
- .set_max_history_size(3)
1272
+ .set_cursor_cls(LowHistoryLimitCursor)
1272
1273
  .set_file_type("csv")
1273
1274
  .set_expected_catalog(
1274
1275
  {
@@ -1386,7 +1387,7 @@ multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_time
1386
1387
  }
1387
1388
  )
1388
1389
  .set_file_type("csv")
1389
- .set_max_history_size(3)
1390
+ .set_cursor_cls(LowHistoryLimitCursor)
1390
1391
  .set_expected_catalog(
1391
1392
  {
1392
1393
  "streams": [
@@ -1509,7 +1510,7 @@ multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_differe
1509
1510
  }
1510
1511
  )
1511
1512
  .set_file_type("csv")
1512
- .set_max_history_size(3)
1513
+ .set_cursor_cls(LowHistoryLimitCursor)
1513
1514
  .set_expected_catalog(
1514
1515
  {
1515
1516
  "streams": [
@@ -11,10 +11,11 @@ from airbyte_cdk.sources.file_based.availability_strategy.abstract_file_based_av
11
11
  AbstractFileBasedAvailabilityStrategy,
12
12
  )
13
13
  from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
14
- from airbyte_cdk.sources.file_based.file_based_source import DEFAULT_MAX_HISTORY_SIZE, default_parsers
14
+ from airbyte_cdk.sources.file_based.file_based_source import default_parsers
15
15
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
16
16
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
17
17
  from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
18
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
18
19
  from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesSource
19
20
 
20
21
 
@@ -46,7 +47,7 @@ class TestScenario:
46
47
  expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]],
47
48
  incremental_scenario_config: Optional[IncrementalScenarioConfig],
48
49
  file_write_options: Mapping[str, Any],
49
- max_history_size: int,
50
+ cursor_cls: Optional[Type[AbstractFileBasedCursor]],
50
51
  ):
51
52
  self.name = name
52
53
  self.config = config
@@ -68,7 +69,7 @@ class TestScenario:
68
69
  stream_reader,
69
70
  self.configured_catalog(SyncMode.incremental if incremental_scenario_config else SyncMode.full_refresh),
70
71
  file_write_options,
71
- max_history_size,
72
+ cursor_cls,
72
73
  )
73
74
  self.incremental_scenario_config = incremental_scenario_config
74
75
  self.validate()
@@ -124,7 +125,7 @@ class TestScenarioBuilder:
124
125
  self._expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]] = None, None
125
126
  self._incremental_scenario_config: Optional[IncrementalScenarioConfig] = None
126
127
  self._file_write_options: Mapping[str, Any] = {}
127
- self._max_history_size = DEFAULT_MAX_HISTORY_SIZE
128
+ self._cursor_cls: Optional[Type[AbstractFileBasedCursor]] = None
128
129
 
129
130
  def set_name(self, name: str) -> "TestScenarioBuilder":
130
131
  self._name = name
@@ -182,8 +183,8 @@ class TestScenarioBuilder:
182
183
  self._stream_reader = stream_reader
183
184
  return self
184
185
 
185
- def set_max_history_size(self, max_history_size: int) -> "TestScenarioBuilder":
186
- self._max_history_size = max_history_size
186
+ def set_cursor_cls(self, cursor_cls: AbstractFileBasedCursor) -> "TestScenarioBuilder":
187
+ self._cursor_cls = cursor_cls
187
188
  return self
188
189
 
189
190
  def set_incremental_scenario_config(self, incremental_scenario_config: IncrementalScenarioConfig) -> "TestScenarioBuilder":
@@ -232,5 +233,5 @@ class TestScenarioBuilder:
232
233
  self._expected_read_error,
233
234
  self._incremental_scenario_config,
234
235
  self._file_write_options,
235
- self._max_history_size,
236
+ self._cursor_cls,
236
237
  )
@@ -7,6 +7,7 @@ from typing import Any, List, Mapping
7
7
  from unittest.mock import MagicMock
8
8
 
9
9
  import pytest
10
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
11
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
12
  from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
12
13
  from freezegun import freeze_time
@@ -103,7 +104,7 @@ from freezegun import freeze_time
103
104
  ],
104
105
  )
105
106
  def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[datetime], expected_state_dict: Mapping[str, Any]) -> None:
106
- cursor = DefaultFileBasedCursor(3, 3)
107
+ cursor = get_cursor(max_history_size=3, days_to_sync_if_history_is_full=3)
107
108
  assert cursor._compute_start_time() == datetime.min
108
109
 
109
110
  for index, f in enumerate(files_to_add):
@@ -160,7 +161,7 @@ def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[date
160
161
  ])
161
162
  def test_get_files_to_sync(files: List[RemoteFile], expected_files_to_sync: List[RemoteFile], max_history_size: int, history_is_partial: bool) -> None:
162
163
  logger = MagicMock()
163
- cursor = DefaultFileBasedCursor(max_history_size, 3)
164
+ cursor = get_cursor(max_history_size, 3)
164
165
 
165
166
  files_to_sync = list(cursor.get_files_to_sync(files, logger))
166
167
  for f in files_to_sync:
@@ -173,7 +174,7 @@ def test_get_files_to_sync(files: List[RemoteFile], expected_files_to_sync: List
173
174
  @freeze_time("2023-06-16T00:00:00Z")
174
175
  def test_only_recent_files_are_synced_if_history_is_full() -> None:
175
176
  logger = MagicMock()
176
- cursor = DefaultFileBasedCursor(2, 3)
177
+ cursor = get_cursor(2, 3)
177
178
 
178
179
  files_in_history = [
179
180
  RemoteFile(uri="b1.csv", last_modified=datetime(2021, 1, 2), file_type="csv"),
@@ -210,7 +211,7 @@ def test_only_recent_files_are_synced_if_history_is_full() -> None:
210
211
  ])
211
212
  def test_sync_file_already_present_in_history(modified_at_delta: timedelta, should_sync_file: bool) -> None:
212
213
  logger = MagicMock()
213
- cursor = DefaultFileBasedCursor(2, 3)
214
+ cursor = get_cursor(2, 3)
214
215
  original_modified_at = datetime(2021, 1, 2)
215
216
  filename = "a.csv"
216
217
  files_in_history = [
@@ -245,7 +246,7 @@ def test_sync_file_already_present_in_history(modified_at_delta: timedelta, shou
245
246
  )
246
247
  def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_in_history: datetime, should_sync_file: bool) -> None:
247
248
  logger = MagicMock()
248
- cursor = DefaultFileBasedCursor(1, 3)
249
+ cursor = get_cursor(1, 3)
249
250
 
250
251
  cursor.add_file(RemoteFile(uri="b.csv", last_modified=earliest_dt_in_history, file_type="csv"))
251
252
  cursor._start_time = cursor._compute_start_time()
@@ -255,13 +256,13 @@ def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_i
255
256
 
256
257
 
257
258
  def test_set_initial_state_no_history() -> None:
258
- cursor = DefaultFileBasedCursor(1, 3)
259
+ cursor = get_cursor(1, 3)
259
260
  cursor.set_initial_state({})
260
261
 
261
262
 
262
- def test_instantiate_with_negative_values() -> None:
263
- with pytest.raises(ValueError):
264
- DefaultFileBasedCursor(-1, 3)
265
-
266
- with pytest.raises(ValueError):
267
- DefaultFileBasedCursor(1, -3)
263
+ def get_cursor(max_history_size: int, days_to_sync_if_history_is_full: int) -> DefaultFileBasedCursor:
264
+ cursor_cls = DefaultFileBasedCursor
265
+ cursor_cls.DEFAULT_MAX_HISTORY_SIZE = max_history_size
266
+ config = FileBasedStreamConfig(
267
+ file_type="csv", name="test", validation_policy="emit_records", days_to_sync_if_history_is_full=days_to_sync_if_history_is_full)
268
+ return cursor_cls(config)
@@ -34,10 +34,25 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
34
34
  success_user_provided_schema_scenario,
35
35
  )
36
36
  from unit_tests.sources.file_based.scenarios.csv_scenarios import (
37
+ csv_autogenerate_column_names_scenario,
38
+ csv_custom_bool_values_scenario,
39
+ csv_custom_delimiter_in_double_quotes_scenario,
40
+ csv_custom_delimiter_with_escape_char_scenario,
37
41
  csv_custom_format_scenario,
42
+ csv_custom_null_values_scenario,
43
+ csv_double_quote_is_set_scenario,
44
+ csv_escape_char_is_set_scenario,
38
45
  csv_legacy_format_scenario,
39
46
  csv_multi_stream_scenario,
47
+ csv_newline_in_values_not_quoted_scenario,
48
+ csv_newline_in_values_quoted_value_scenario,
40
49
  csv_single_stream_scenario,
50
+ csv_skip_after_header_scenario,
51
+ csv_skip_before_and_after_header_scenario,
52
+ csv_skip_before_header_scenario,
53
+ csv_string_can_be_null_with_input_schemas_scenario,
54
+ csv_string_not_null_if_no_null_values_scenario,
55
+ csv_strings_can_be_null_not_quoted_scenario,
41
56
  empty_schema_inference_scenario,
42
57
  invalid_csv_scenario,
43
58
  multi_csv_scenario,
@@ -162,11 +177,26 @@ discover_scenarios = [
162
177
  jsonl_user_input_schema_scenario,
163
178
  schemaless_jsonl_scenario,
164
179
  schemaless_jsonl_multi_stream_scenario,
180
+ csv_string_can_be_null_with_input_schemas_scenario,
181
+ csv_string_not_null_if_no_null_values_scenario,
182
+ csv_strings_can_be_null_not_quoted_scenario,
183
+ csv_newline_in_values_quoted_value_scenario,
184
+ csv_escape_char_is_set_scenario,
185
+ csv_double_quote_is_set_scenario,
186
+ csv_custom_delimiter_with_escape_char_scenario,
187
+ csv_custom_delimiter_in_double_quotes_scenario,
188
+ csv_skip_before_header_scenario,
189
+ csv_skip_after_header_scenario,
190
+ csv_skip_before_and_after_header_scenario,
191
+ csv_custom_bool_values_scenario,
192
+ csv_custom_null_values_scenario,
165
193
  single_avro_scenario,
166
194
  avro_all_types_scenario,
167
195
  multiple_avro_combine_schema_scenario,
168
196
  multiple_streams_avro_scenario,
169
197
  avro_file_with_decimal_as_float_scenario,
198
+ csv_newline_in_values_not_quoted_scenario,
199
+ csv_autogenerate_column_names_scenario,
170
200
  ]
171
201
 
172
202