airbyte-cdk 0.51.15__py3-none-any.whl → 0.51.17__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29)
  1. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +494 -522
  2. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +1 -1
  3. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +2 -37
  4. airbyte_cdk/sources/file_based/file_based_source.py +1 -1
  5. airbyte_cdk/sources/file_based/file_types/__init__.py +11 -6
  6. airbyte_cdk/sources/file_based/file_types/avro_parser.py +1 -1
  7. airbyte_cdk/sources/file_based/file_types/csv_parser.py +1 -1
  8. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +2 -2
  9. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +5 -5
  10. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +7 -5
  11. airbyte_cdk/utils/datetime_format_inferrer.py +8 -4
  12. {airbyte_cdk-0.51.15.dist-info → airbyte_cdk-0.51.17.dist-info}/METADATA +1 -1
  13. {airbyte_cdk-0.51.15.dist-info → airbyte_cdk-0.51.17.dist-info}/RECORD +29 -29
  14. unit_tests/sources/file_based/file_types/test_avro_parser.py +6 -6
  15. unit_tests/sources/file_based/scenarios/avro_scenarios.py +5 -6
  16. unit_tests/sources/file_based/scenarios/check_scenarios.py +8 -8
  17. unit_tests/sources/file_based/scenarios/csv_scenarios.py +19 -42
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +15 -15
  19. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +13 -12
  20. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +5 -9
  21. unit_tests/sources/file_based/scenarios/scenario_builder.py +1 -1
  22. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +16 -16
  23. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +9 -9
  24. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +2 -1
  25. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +6 -3
  26. unit_tests/utils/test_datetime_format_inferrer.py +1 -0
  27. {airbyte_cdk-0.51.15.dist-info → airbyte_cdk-0.51.17.dist-info}/LICENSE.txt +0 -0
  28. {airbyte_cdk-0.51.15.dist-info → airbyte_cdk-0.51.17.dist-info}/WHEEL +0 -0
  29. {airbyte_cdk-0.51.15.dist-info → airbyte_cdk-0.51.17.dist-info}/top_level.txt +0 -0
@@ -78,7 +78,7 @@ valid_single_stream_user_input_schema_scenario = (
78
78
  "streams": [
79
79
  {
80
80
  "name": "stream1",
81
- "file_type": "csv",
81
+ "format": {"filetype": "csv"},
82
82
  "globs": ["*"],
83
83
  "validation_policy": "Emit Record",
84
84
  "input_schema": '{"col1": "string", "col2": "string"}',
@@ -98,7 +98,7 @@ single_stream_user_input_schema_scenario_schema_is_invalid = (
98
98
  "streams": [
99
99
  {
100
100
  "name": "stream1",
101
- "file_type": "csv",
101
+ "format": {"filetype": "csv"},
102
102
  "globs": ["*"],
103
103
  "validation_policy": "Emit Record",
104
104
  "input_schema": '{"col1": "x", "col2": "string"}',
@@ -121,7 +121,7 @@ single_stream_user_input_schema_scenario_emit_nonconforming_records = (
121
121
  "streams": [
122
122
  {
123
123
  "name": "stream1",
124
- "file_type": "csv",
124
+ "format": {"filetype": "csv"},
125
125
  "globs": ["*"],
126
126
  "validation_policy": "Emit Record",
127
127
  "input_schema": '{"col1": "integer", "col2": "string"}',
@@ -171,7 +171,7 @@ single_stream_user_input_schema_scenario_skip_nonconforming_records = (
171
171
  "streams": [
172
172
  {
173
173
  "name": "stream1",
174
- "file_type": "csv",
174
+ "format": {"filetype": "csv"},
175
175
  "globs": ["*"],
176
176
  "validation_policy": "Skip Record",
177
177
  "input_schema": '{"col1": "integer", "col2": "string"}',
@@ -364,21 +364,21 @@ valid_multi_stream_user_input_schema_scenario = (
364
364
  "streams": [
365
365
  {
366
366
  "name": "stream1",
367
- "file_type": "csv",
367
+ "format": {"filetype": "csv"},
368
368
  "globs": ["a.csv"],
369
369
  "validation_policy": "Emit Record",
370
370
  "input_schema": '{"col1": "string", "col2": "integer"}',
371
371
  },
372
372
  {
373
373
  "name": "stream2",
374
- "file_type": "csv",
374
+ "format": {"filetype": "csv"},
375
375
  "globs": ["b.csv"],
376
376
  "validation_policy": "Emit Record",
377
377
  "input_schema": '{"col1": "string", "col2": "string", "col3": "string"}',
378
378
  },
379
379
  {
380
380
  "name": "stream3",
381
- "file_type": "csv",
381
+ "format": {"filetype": "csv"},
382
382
  "globs": ["c.csv"],
383
383
  "validation_policy": "Emit Record",
384
384
  },
@@ -398,21 +398,21 @@ multi_stream_user_input_schema_scenario_schema_is_invalid = (
398
398
  "streams": [
399
399
  {
400
400
  "name": "stream1",
401
- "file_type": "csv",
401
+ "format": {"filetype": "csv"},
402
402
  "globs": ["a.csv"],
403
403
  "validation_policy": "Emit Record",
404
404
  "input_schema": '{"col1": "string", "col2": "integer"}',
405
405
  },
406
406
  {
407
407
  "name": "stream2",
408
- "file_type": "csv",
408
+ "format": {"filetype": "csv"},
409
409
  "globs": ["b.csv"],
410
410
  "validation_policy": "Emit Record",
411
411
  "input_schema": '{"col1": "x", "col2": "string", "col3": "string"}', # this stream's schema is invalid
412
412
  },
413
413
  {
414
414
  "name": "stream3",
415
- "file_type": "csv",
415
+ "format": {"filetype": "csv"},
416
416
  "globs": ["c.csv"],
417
417
  "validation_policy": "Emit Record",
418
418
  },
@@ -435,21 +435,21 @@ multi_stream_user_input_schema_scenario_emit_nonconforming_records = (
435
435
  "streams": [
436
436
  {
437
437
  "name": "stream1",
438
- "file_type": "csv",
438
+ "format": {"filetype": "csv"},
439
439
  "globs": ["a.csv"],
440
440
  "validation_policy": "Emit Record",
441
441
  "input_schema": '{"col1": "string", "col2": "integer"}',
442
442
  },
443
443
  {
444
444
  "name": "stream2",
445
- "file_type": "csv",
445
+ "format": {"filetype": "csv"},
446
446
  "globs": ["b.csv"],
447
447
  "validation_policy": "Emit Record",
448
448
  "input_schema": '{"col1": "string", "col2": "integer", "col3": "string"}', # this stream's records do not conform to the schema
449
449
  },
450
450
  {
451
451
  "name": "stream3",
452
- "file_type": "csv",
452
+ "format": {"filetype": "csv"},
453
453
  "globs": ["c.csv"],
454
454
  "validation_policy": "Emit Record",
455
455
  },
@@ -574,21 +574,21 @@ multi_stream_user_input_schema_scenario_skip_nonconforming_records = (
574
574
  "streams": [
575
575
  {
576
576
  "name": "stream1",
577
- "file_type": "csv",
577
+ "format": {"filetype": "csv"},
578
578
  "globs": ["a.csv"],
579
579
  "validation_policy": "Emit Record",
580
580
  "input_schema": '{"col1": "string", "col2": "integer"}',
581
581
  },
582
582
  {
583
583
  "name": "stream2",
584
- "file_type": "csv",
584
+ "format": {"filetype": "csv"},
585
585
  "globs": ["b.csv"],
586
586
  "validation_policy": "Skip Record",
587
587
  "input_schema": '{"col1": "string", "col2": "integer", "col3": "string"}', # this stream's records do not conform to the schema
588
588
  },
589
589
  {
590
590
  "name": "stream3",
591
- "file_type": "csv",
591
+ "format": {"filetype": "csv"},
592
592
  "globs": ["c.csv"],
593
593
  "validation_policy": "Emit Record",
594
594
  },
@@ -204,7 +204,7 @@ skip_record_scenario_single_stream = (
204
204
  "streams": [
205
205
  {
206
206
  "name": "stream1",
207
- "file_type": "csv",
207
+ "format": {"filetype": "csv"},
208
208
  "globs": ["*.csv"],
209
209
  "validation_policy": "Skip Record",
210
210
  }
@@ -250,13 +250,13 @@ skip_record_scenario_multi_stream = (
250
250
  "streams": [
251
251
  {
252
252
  "name": "stream1",
253
- "file_type": "csv",
253
+ "format": {"filetype": "csv"},
254
254
  "globs": ["a/*.csv"],
255
255
  "validation_policy": "Skip Record",
256
256
  },
257
257
  {
258
258
  "name": "stream2",
259
- "file_type": "csv",
259
+ "format": {"filetype": "csv"},
260
260
  "globs": ["b/*.csv"],
261
261
  "validation_policy": "Skip Record",
262
262
  }
@@ -317,7 +317,7 @@ emit_record_scenario_single_stream = (
317
317
  "streams": [
318
318
  {
319
319
  "name": "stream1",
320
- "file_type": "csv",
320
+ "format": {"filetype": "csv"},
321
321
  "globs": ["*.csv"],
322
322
  "validation_policy": "Emit Record",
323
323
  }
@@ -359,13 +359,13 @@ emit_record_scenario_multi_stream = (
359
359
  "streams": [
360
360
  {
361
361
  "name": "stream1",
362
- "file_type": "csv",
362
+ "format": {"filetype": "csv"},
363
363
  "globs": ["a/*.csv"],
364
364
  "validation_policy": "Emit Record",
365
365
  },
366
366
  {
367
367
  "name": "stream2",
368
- "file_type": "csv",
368
+ "format": {"filetype": "csv"},
369
369
  "globs": ["b/*.csv"],
370
370
  "validation_policy": "Emit Record",
371
371
  }
@@ -418,7 +418,7 @@ wait_for_rediscovery_scenario_single_stream = (
418
418
  "streams": [
419
419
  {
420
420
  "name": "stream1",
421
- "file_type": "csv",
421
+ "format": {"filetype": "csv"},
422
422
  "globs": ["*.csv"],
423
423
  "validation_policy": "Wait for Discover",
424
424
  }
@@ -453,13 +453,13 @@ wait_for_rediscovery_scenario_multi_stream = (
453
453
  "streams": [
454
454
  {
455
455
  "name": "stream1",
456
- "file_type": "csv",
456
+ "format": {"filetype": "csv"},
457
457
  "globs": ["a/*.csv"],
458
458
  "validation_policy": "Wait for Discover",
459
459
  },
460
460
  {
461
461
  "name": "stream2",
462
- "file_type": "csv",
462
+ "format": {"filetype": "csv"},
463
463
  "globs": ["b/*.csv"],
464
464
  "validation_policy": "Wait for Discover",
465
465
  }
@@ -7,6 +7,7 @@ from typing import Any, List, Mapping
7
7
  from unittest.mock import MagicMock
8
8
 
9
9
  import pytest
10
+ from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
10
11
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
11
12
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
12
13
  from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
@@ -264,5 +265,5 @@ def get_cursor(max_history_size: int, days_to_sync_if_history_is_full: int) -> D
264
265
  cursor_cls = DefaultFileBasedCursor
265
266
  cursor_cls.DEFAULT_MAX_HISTORY_SIZE = max_history_size
266
267
  config = FileBasedStreamConfig(
267
- file_type="csv", name="test", validation_policy=ValidationPolicy.emit_record, days_to_sync_if_history_is_full=days_to_sync_if_history_is_full)
268
+ format=CsvFormat(), name="test", validation_policy=ValidationPolicy.emit_record, days_to_sync_if_history_is_full=days_to_sync_if_history_is_full)
268
269
  return cursor_cls(config)
@@ -19,6 +19,10 @@ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
19
19
  from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream
20
20
 
21
21
 
22
+ class MockFormat:
23
+ pass
24
+
25
+
22
26
  @pytest.mark.parametrize(
23
27
  "input_schema, expected_output",
24
28
  [
@@ -60,13 +64,12 @@ def test_fill_nulls(input_schema: Mapping[str, Any], expected_output: Mapping[st
60
64
 
61
65
 
62
66
  class DefaultFileBasedStreamTest(unittest.TestCase):
63
- _FILE_TYPE = "file_type"
64
67
  _NOW = datetime(2022, 10, 22, tzinfo=timezone.utc)
65
68
  _A_RECORD = {"a_record": 1}
66
69
 
67
70
  def setUp(self) -> None:
68
71
  self._stream_config = Mock()
69
- self._stream_config.file_type = self._FILE_TYPE
72
+ self._stream_config.format = MockFormat()
70
73
  self._stream_config.name = "a stream name"
71
74
  self._catalog_schema = Mock()
72
75
  self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
@@ -83,7 +86,7 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
83
86
  stream_reader=self._stream_reader,
84
87
  availability_strategy=self._availability_strategy,
85
88
  discovery_policy=self._discovery_policy,
86
- parsers={self._FILE_TYPE: self._parser},
89
+ parsers={MockFormat: self._parser},
87
90
  validation_policy=self._validation_policy,
88
91
  cursor=self._cursor,
89
92
  )
@@ -22,6 +22,7 @@ NOW = 1234567
22
22
  ("timestamp_ms_match_string", [{"d": "1686058051000"}], {"d": "%ms"}),
23
23
  ("timestamp_no_match_integer", [{"d": 99}], {}),
24
24
  ("timestamp_no_match_string", [{"d": "99999999999999999999"}], {}),
25
+ ("timestamp_overflow", [{"d": f"{10**100}_100"}], {}), # this case was previously causing OverflowError hence this test
25
26
  ("simple_no_match", [{"d": "20220203"}], {}),
26
27
  ("multiple_match", [{"d": "2022-02-03", "e": "2022-02-03"}], {"d": "%Y-%m-%d", "e": "%Y-%m-%d"}),
27
28
  (