airbyte-cdk 0.50.0__py3-none-any.whl → 0.50.2__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (25)
  1. airbyte_cdk/entrypoint.py +7 -0
  2. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -3
  3. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -3
  4. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +9 -9
  5. airbyte_cdk/sources/file_based/config/csv_format.py +42 -6
  6. airbyte_cdk/sources/file_based/file_based_source.py +4 -5
  7. airbyte_cdk/sources/file_based/file_types/csv_parser.py +114 -59
  8. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +2 -2
  9. airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} +9 -1
  10. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +10 -10
  11. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +15 -2
  12. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA +1 -1
  13. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/RECORD +25 -24
  14. unit_tests/sources/file_based/config/test_csv_format.py +23 -0
  15. unit_tests/sources/file_based/file_types/test_csv_parser.py +50 -18
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/in_memory_files_source.py +11 -3
  18. unit_tests/sources/file_based/scenarios/csv_scenarios.py +1254 -47
  19. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +6 -5
  20. unit_tests/sources/file_based/scenarios/scenario_builder.py +8 -7
  21. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +13 -12
  22. unit_tests/sources/file_based/test_scenarios.py +30 -0
  23. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/LICENSE.txt +0 -0
  24. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/WHEEL +0 -0
  25. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+ from unit_tests.sources.file_based.helpers import LowHistoryLimitCursor
5
6
  from unit_tests.sources.file_based.scenarios.scenario_builder import IncrementalScenarioConfig, TestScenarioBuilder
6
7
 
7
8
  single_csv_input_state_is_earlier_scenario = (
@@ -1004,7 +1005,7 @@ multi_csv_remove_old_files_if_history_is_full_scenario = (
1004
1005
  }
1005
1006
  )
1006
1007
  .set_file_type("csv")
1007
- .set_max_history_size(3)
1008
+ .set_cursor_cls(LowHistoryLimitCursor)
1008
1009
  .set_expected_catalog(
1009
1010
  {
1010
1011
  "streams": [
@@ -1151,7 +1152,7 @@ multi_csv_same_timestamp_more_files_than_history_size_scenario = (
1151
1152
  }
1152
1153
  )
1153
1154
  .set_file_type("csv")
1154
- .set_max_history_size(3)
1155
+ .set_cursor_cls(LowHistoryLimitCursor)
1155
1156
  .set_expected_catalog(
1156
1157
  {
1157
1158
  "streams": [
@@ -1268,7 +1269,7 @@ multi_csv_sync_recent_files_if_history_is_incomplete_scenario = (
1268
1269
  },
1269
1270
  }
1270
1271
  )
1271
- .set_max_history_size(3)
1272
+ .set_cursor_cls(LowHistoryLimitCursor)
1272
1273
  .set_file_type("csv")
1273
1274
  .set_expected_catalog(
1274
1275
  {
@@ -1386,7 +1387,7 @@ multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_time
1386
1387
  }
1387
1388
  )
1388
1389
  .set_file_type("csv")
1389
- .set_max_history_size(3)
1390
+ .set_cursor_cls(LowHistoryLimitCursor)
1390
1391
  .set_expected_catalog(
1391
1392
  {
1392
1393
  "streams": [
@@ -1509,7 +1510,7 @@ multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_differe
1509
1510
  }
1510
1511
  )
1511
1512
  .set_file_type("csv")
1512
- .set_max_history_size(3)
1513
+ .set_cursor_cls(LowHistoryLimitCursor)
1513
1514
  .set_expected_catalog(
1514
1515
  {
1515
1516
  "streams": [
@@ -11,10 +11,11 @@ from airbyte_cdk.sources.file_based.availability_strategy.abstract_file_based_av
11
11
  AbstractFileBasedAvailabilityStrategy,
12
12
  )
13
13
  from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
14
- from airbyte_cdk.sources.file_based.file_based_source import DEFAULT_MAX_HISTORY_SIZE, default_parsers
14
+ from airbyte_cdk.sources.file_based.file_based_source import default_parsers
15
15
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
16
16
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
17
17
  from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
18
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
18
19
  from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesSource
19
20
 
20
21
 
@@ -46,7 +47,7 @@ class TestScenario:
46
47
  expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]],
47
48
  incremental_scenario_config: Optional[IncrementalScenarioConfig],
48
49
  file_write_options: Mapping[str, Any],
49
- max_history_size: int,
50
+ cursor_cls: Optional[Type[AbstractFileBasedCursor]],
50
51
  ):
51
52
  self.name = name
52
53
  self.config = config
@@ -68,7 +69,7 @@ class TestScenario:
68
69
  stream_reader,
69
70
  self.configured_catalog(SyncMode.incremental if incremental_scenario_config else SyncMode.full_refresh),
70
71
  file_write_options,
71
- max_history_size,
72
+ cursor_cls,
72
73
  )
73
74
  self.incremental_scenario_config = incremental_scenario_config
74
75
  self.validate()
@@ -124,7 +125,7 @@ class TestScenarioBuilder:
124
125
  self._expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]] = None, None
125
126
  self._incremental_scenario_config: Optional[IncrementalScenarioConfig] = None
126
127
  self._file_write_options: Mapping[str, Any] = {}
127
- self._max_history_size = DEFAULT_MAX_HISTORY_SIZE
128
+ self._cursor_cls: Optional[Type[AbstractFileBasedCursor]] = None
128
129
 
129
130
  def set_name(self, name: str) -> "TestScenarioBuilder":
130
131
  self._name = name
@@ -182,8 +183,8 @@ class TestScenarioBuilder:
182
183
  self._stream_reader = stream_reader
183
184
  return self
184
185
 
185
- def set_max_history_size(self, max_history_size: int) -> "TestScenarioBuilder":
186
- self._max_history_size = max_history_size
186
+ def set_cursor_cls(self, cursor_cls: AbstractFileBasedCursor) -> "TestScenarioBuilder":
187
+ self._cursor_cls = cursor_cls
187
188
  return self
188
189
 
189
190
  def set_incremental_scenario_config(self, incremental_scenario_config: IncrementalScenarioConfig) -> "TestScenarioBuilder":
@@ -232,5 +233,5 @@ class TestScenarioBuilder:
232
233
  self._expected_read_error,
233
234
  self._incremental_scenario_config,
234
235
  self._file_write_options,
235
- self._max_history_size,
236
+ self._cursor_cls,
236
237
  )
@@ -7,6 +7,7 @@ from typing import Any, List, Mapping
7
7
  from unittest.mock import MagicMock
8
8
 
9
9
  import pytest
10
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
11
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
12
  from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
12
13
  from freezegun import freeze_time
@@ -103,7 +104,7 @@ from freezegun import freeze_time
103
104
  ],
104
105
  )
105
106
  def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[datetime], expected_state_dict: Mapping[str, Any]) -> None:
106
- cursor = DefaultFileBasedCursor(3, 3)
107
+ cursor = get_cursor(max_history_size=3, days_to_sync_if_history_is_full=3)
107
108
  assert cursor._compute_start_time() == datetime.min
108
109
 
109
110
  for index, f in enumerate(files_to_add):
@@ -160,7 +161,7 @@ def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[date
160
161
  ])
161
162
  def test_get_files_to_sync(files: List[RemoteFile], expected_files_to_sync: List[RemoteFile], max_history_size: int, history_is_partial: bool) -> None:
162
163
  logger = MagicMock()
163
- cursor = DefaultFileBasedCursor(max_history_size, 3)
164
+ cursor = get_cursor(max_history_size, 3)
164
165
 
165
166
  files_to_sync = list(cursor.get_files_to_sync(files, logger))
166
167
  for f in files_to_sync:
@@ -173,7 +174,7 @@ def test_get_files_to_sync(files: List[RemoteFile], expected_files_to_sync: List
173
174
  @freeze_time("2023-06-16T00:00:00Z")
174
175
  def test_only_recent_files_are_synced_if_history_is_full() -> None:
175
176
  logger = MagicMock()
176
- cursor = DefaultFileBasedCursor(2, 3)
177
+ cursor = get_cursor(2, 3)
177
178
 
178
179
  files_in_history = [
179
180
  RemoteFile(uri="b1.csv", last_modified=datetime(2021, 1, 2), file_type="csv"),
@@ -210,7 +211,7 @@ def test_only_recent_files_are_synced_if_history_is_full() -> None:
210
211
  ])
211
212
  def test_sync_file_already_present_in_history(modified_at_delta: timedelta, should_sync_file: bool) -> None:
212
213
  logger = MagicMock()
213
- cursor = DefaultFileBasedCursor(2, 3)
214
+ cursor = get_cursor(2, 3)
214
215
  original_modified_at = datetime(2021, 1, 2)
215
216
  filename = "a.csv"
216
217
  files_in_history = [
@@ -245,7 +246,7 @@ def test_sync_file_already_present_in_history(modified_at_delta: timedelta, shou
245
246
  )
246
247
  def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_in_history: datetime, should_sync_file: bool) -> None:
247
248
  logger = MagicMock()
248
- cursor = DefaultFileBasedCursor(1, 3)
249
+ cursor = get_cursor(1, 3)
249
250
 
250
251
  cursor.add_file(RemoteFile(uri="b.csv", last_modified=earliest_dt_in_history, file_type="csv"))
251
252
  cursor._start_time = cursor._compute_start_time()
@@ -255,13 +256,13 @@ def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_i
255
256
 
256
257
 
257
258
  def test_set_initial_state_no_history() -> None:
258
- cursor = DefaultFileBasedCursor(1, 3)
259
+ cursor = get_cursor(1, 3)
259
260
  cursor.set_initial_state({})
260
261
 
261
262
 
262
- def test_instantiate_with_negative_values() -> None:
263
- with pytest.raises(ValueError):
264
- DefaultFileBasedCursor(-1, 3)
265
-
266
- with pytest.raises(ValueError):
267
- DefaultFileBasedCursor(1, -3)
263
+ def get_cursor(max_history_size: int, days_to_sync_if_history_is_full: int) -> DefaultFileBasedCursor:
264
+ cursor_cls = DefaultFileBasedCursor
265
+ cursor_cls.DEFAULT_MAX_HISTORY_SIZE = max_history_size
266
+ config = FileBasedStreamConfig(
267
+ file_type="csv", name="test", validation_policy="emit_records", days_to_sync_if_history_is_full=days_to_sync_if_history_is_full)
268
+ return cursor_cls(config)
@@ -34,10 +34,25 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
34
34
  success_user_provided_schema_scenario,
35
35
  )
36
36
  from unit_tests.sources.file_based.scenarios.csv_scenarios import (
37
+ csv_autogenerate_column_names_scenario,
38
+ csv_custom_bool_values_scenario,
39
+ csv_custom_delimiter_in_double_quotes_scenario,
40
+ csv_custom_delimiter_with_escape_char_scenario,
37
41
  csv_custom_format_scenario,
42
+ csv_custom_null_values_scenario,
43
+ csv_double_quote_is_set_scenario,
44
+ csv_escape_char_is_set_scenario,
38
45
  csv_legacy_format_scenario,
39
46
  csv_multi_stream_scenario,
47
+ csv_newline_in_values_not_quoted_scenario,
48
+ csv_newline_in_values_quoted_value_scenario,
40
49
  csv_single_stream_scenario,
50
+ csv_skip_after_header_scenario,
51
+ csv_skip_before_and_after_header_scenario,
52
+ csv_skip_before_header_scenario,
53
+ csv_string_can_be_null_with_input_schemas_scenario,
54
+ csv_string_not_null_if_no_null_values_scenario,
55
+ csv_strings_can_be_null_not_quoted_scenario,
41
56
  empty_schema_inference_scenario,
42
57
  invalid_csv_scenario,
43
58
  multi_csv_scenario,
@@ -162,11 +177,26 @@ discover_scenarios = [
162
177
  jsonl_user_input_schema_scenario,
163
178
  schemaless_jsonl_scenario,
164
179
  schemaless_jsonl_multi_stream_scenario,
180
+ csv_string_can_be_null_with_input_schemas_scenario,
181
+ csv_string_not_null_if_no_null_values_scenario,
182
+ csv_strings_can_be_null_not_quoted_scenario,
183
+ csv_newline_in_values_quoted_value_scenario,
184
+ csv_escape_char_is_set_scenario,
185
+ csv_double_quote_is_set_scenario,
186
+ csv_custom_delimiter_with_escape_char_scenario,
187
+ csv_custom_delimiter_in_double_quotes_scenario,
188
+ csv_skip_before_header_scenario,
189
+ csv_skip_after_header_scenario,
190
+ csv_skip_before_and_after_header_scenario,
191
+ csv_custom_bool_values_scenario,
192
+ csv_custom_null_values_scenario,
165
193
  single_avro_scenario,
166
194
  avro_all_types_scenario,
167
195
  multiple_avro_combine_schema_scenario,
168
196
  multiple_streams_avro_scenario,
169
197
  avro_file_with_decimal_as_float_scenario,
198
+ csv_newline_in_values_not_quoted_scenario,
199
+ csv_autogenerate_column_names_scenario,
170
200
  ]
171
201
 
172
202