airbyte-cdk 0.40.2__py3-none-any.whl → 0.40.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,7 @@ from airbyte_cdk.entrypoint import AirbyteEntrypoint
14
14
  from airbyte_cdk.sources import AbstractSource
15
15
  from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource
16
16
  from airbyte_cdk.utils import AirbyteTracedException
17
+ from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer
17
18
  from airbyte_cdk.utils.schema_inferrer import SchemaInferrer
18
19
  from airbyte_protocol.models.airbyte_protocol import (
19
20
  AirbyteControlMessage,
@@ -46,6 +47,7 @@ class MessageGrouper:
46
47
  if record_limit is not None and not (1 <= record_limit <= 1000):
47
48
  raise ValueError(f"Record limit must be between 1 and 1000. Got {record_limit}")
48
49
  schema_inferrer = SchemaInferrer()
50
+ datetime_format_inferrer = DatetimeFormatInferrer()
49
51
 
50
52
  if record_limit is None:
51
53
  record_limit = self._max_record_limit
@@ -58,6 +60,7 @@ class MessageGrouper:
58
60
  for message_group in self._get_message_groups(
59
61
  self._read_stream(source, config, configured_catalog),
60
62
  schema_inferrer,
63
+ datetime_format_inferrer,
61
64
  record_limit,
62
65
  ):
63
66
  if isinstance(message_group, AirbyteLogMessage):
@@ -80,10 +83,11 @@ class MessageGrouper:
80
83
  configured_catalog.streams[0].stream.name
81
84
  ), # The connector builder currently only supports reading from a single stream at a time
82
85
  latest_config_update=latest_config_update.connectorConfig.config if latest_config_update else self._clean_config(config),
86
+ inferred_datetime_formats=datetime_format_inferrer.get_inferred_datetime_formats(),
83
87
  )
84
88
 
85
89
  def _get_message_groups(
86
- self, messages: Iterator[AirbyteMessage], schema_inferrer: SchemaInferrer, limit: int
90
+ self, messages: Iterator[AirbyteMessage], schema_inferrer: SchemaInferrer, datetime_format_inferrer: DatetimeFormatInferrer, limit: int
87
91
  ) -> Iterable[Union[StreamReadPages, AirbyteControlMessage, AirbyteLogMessage, AirbyteTraceMessage]]:
88
92
  """
89
93
  Message groups are partitioned according to when request log messages are received. Subsequent response log messages
@@ -141,6 +145,7 @@ class MessageGrouper:
141
145
  current_page_records.append(message.record.data)
142
146
  records_count += 1
143
147
  schema_inferrer.accumulate(message.record)
148
+ datetime_format_inferrer.accumulate(message.record)
144
149
  elif message.type == MessageType.CONTROL and message.control.type == OrchestratorType.CONNECTOR_CONFIG:
145
150
  yield message.control
146
151
  else:
@@ -48,6 +48,7 @@ class StreamRead(object):
48
48
  slices: List[StreamReadSlices]
49
49
  test_read_limit_reached: bool
50
50
  inferred_schema: Optional[Dict[str, Any]]
51
+ inferred_datetime_formats: Optional[Dict[str, str]]
51
52
  latest_config_update: Optional[Dict[str, Any]]
52
53
 
53
54
 
@@ -0,0 +1,80 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from typing import Any, Dict, Union
6
+
7
+ from airbyte_cdk.models import AirbyteRecordMessage
8
+ from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser
9
+
10
+
11
+ class DatetimeFormatInferrer:
12
+ """
13
+ This class is used to detect top-level fields in records that might be datetime values, along with the format they use.
14
+ """
15
+
16
+ def __init__(self):
17
+ self._parser = DatetimeParser()
18
+ self._datetime_candidates: Union[None, Dict[str, str]] = None
19
+ self._formats = [
20
+ "%Y-%m-%d",
21
+ "%Y-%m-%d %H:%M:%S",
22
+ "%Y-%m-%d %H:%M:%S.%f+00:00",
23
+ "%Y-%m-%dT%H:%M:%S.%f%z",
24
+ "%s",
25
+ "%d/%m/%Y %H:%M",
26
+ "%Y-%m",
27
+ "%d-%m-%Y",
28
+ "%Y-%m-%dT%H:%M:%SZ",
29
+ ]
30
+ self._timestamp_heuristic_range = range(1_000_000_000, 2_000_000_000)
31
+
32
+ def _can_be_datetime(self, value: Any) -> bool:
33
+ """Checks if the value can be a datetime. This is the case if the value is a string that is not purely decimal digits, or a decimal string or an integer from 1_000_000_000 (inclusive) to 2_000_000_000 (exclusive). This is separate from the format check for performance reasons."""
34
+ if isinstance(value, str) and (not value.isdecimal() or int(value) in self._timestamp_heuristic_range):
35
+ return True
36
+ if isinstance(value, int) and value in self._timestamp_heuristic_range:
37
+ return True
38
+ return False
39
+
40
+ def _matches_format(self, value: Any, format: str) -> bool:
41
+ """Checks if the value matches the format"""
42
+ try:
43
+ self._parser.parse(value, format)
44
+ return True
45
+ except ValueError:
46
+ return False
47
+
48
+ def _initialize(self, record: AirbyteRecordMessage):
49
+ """Initializes the internal state of the class"""
50
+ self._datetime_candidates = {}
51
+ for field_name, field_value in record.data.items():
52
+ if not self._can_be_datetime(field_value):
53
+ continue
54
+ for format in self._formats:
55
+ if self._matches_format(field_value, format):
56
+ self._datetime_candidates[field_name] = format
57
+ break
58
+
59
+ def _validate(self, record: AirbyteRecordMessage):
60
+ """Validates that the record is consistent with the inferred datetime formats"""
61
+ for candidate_field_name in list(self._datetime_candidates.keys()):
62
+ candidate_field_format = self._datetime_candidates[candidate_field_name]
63
+ current_value = record.data.get(candidate_field_name, None)
64
+ if (
65
+ current_value is None
66
+ or not self._can_be_datetime(current_value)
67
+ or not self._matches_format(current_value, candidate_field_format)
68
+ ):
69
+ self._datetime_candidates.pop(candidate_field_name)
70
+
71
+ def accumulate(self, record: AirbyteRecordMessage):
72
+ """Analyzes the record and updates the internal state of candidate datetime fields"""
73
+ self._initialize(record) if self._datetime_candidates is None else self._validate(record)
74
+
75
+ def get_inferred_datetime_formats(self) -> Dict[str, str]:
76
+ """
77
+ Returns the candidate datetime fields as a dict - the keys are the field names and the values are the inferred datetime formats.
78
+ For these fields the format was consistent across all visited records.
79
+ """
80
+ return self._datetime_candidates or {}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 0.40.2
3
+ Version: 0.40.3
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://github.com/airbytehq/airbyte
6
6
  Author: Airbyte
@@ -8,8 +8,8 @@ airbyte_cdk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  airbyte_cdk/connector_builder/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
9
9
  airbyte_cdk/connector_builder/connector_builder_handler.py,sha256=q8mqQjNqpvHZgwVbNuvSe19o4Aw6MQTuhA2URmdz0K0,5443
10
10
  airbyte_cdk/connector_builder/main.py,sha256=jn2gqaYAvd6uDoFe0oVhnY23grm5sL-jfIX6kGvhVxk,2994
11
- airbyte_cdk/connector_builder/message_grouper.py,sha256=uJGOBhinvbisgAa-bQN3XE2L2xFTeVeykLwDCRYcxgc,12110
12
- airbyte_cdk/connector_builder/models.py,sha256=yW_j91B-3FYNTNbWjR2ZVYTXBHlskT55uxdAqg7FhAE,1221
11
+ airbyte_cdk/connector_builder/message_grouper.py,sha256=yEjvwdXgzYK29xwjl88-4s-J49iaud8_aOrAlOkAzsg,12504
12
+ airbyte_cdk/connector_builder/models.py,sha256=jL2SJIWJTLCbBqobw5Qo8WGS0aN-K9TRmfSpDHM5vYc,1277
13
13
  airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6fQFbWKY,126
14
14
  airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
15
15
  airbyte_cdk/models/__init__.py,sha256=LPQcYdDPwrCXiBPe_jexO4UAcbovIb1V9tHB6I7Un30,633
@@ -156,6 +156,7 @@ airbyte_cdk/sources/utils/schema_models.py,sha256=m1vOqNkkVYGblc492wKo11Zm5FK9F0
156
156
  airbyte_cdk/sources/utils/transform.py,sha256=4GYmO6bq33HF-a1in0dKQKqUOYI1bWItyuYF875bSQg,9493
157
157
  airbyte_cdk/utils/__init__.py,sha256=kFLcs2P-tbPyeVOJS9rOv1jZdnSpjG24ro0CHgt_CIk,215
158
158
  airbyte_cdk/utils/airbyte_secrets_utils.py,sha256=q3aDl8T10ufGbeqnUPqbZLxQcHdkf2kDfQK_upWzBbI,2894
159
+ airbyte_cdk/utils/datetime_format_inferrer.py,sha256=1z5lGq_DI9LFrT68ftlJSqndS6i-Rs1PX7T_RBtOJpA,3443
159
160
  airbyte_cdk/utils/event_timing.py,sha256=Hn5kCc9xGKLcV5EYpJCZwNiz9neKKu2WG8FJF_hy278,2377
160
161
  airbyte_cdk/utils/schema_inferrer.py,sha256=j0us_mEMj8PVVzSZfoS1adK7V7a--mSHQozo6xmsiIc,3720
161
162
  airbyte_cdk/utils/stream_status_utils.py,sha256=X1Vy7BhglycjdIWpfKDfwJussNCxYffelKt6Utjx-qY,1005
@@ -163,8 +164,8 @@ airbyte_cdk/utils/traced_exception.py,sha256=9G2sG9eYkvn6Aa7rMuUW_KIRszRaTc_xdnT
163
164
  source_declarative_manifest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
164
165
  source_declarative_manifest/main.py,sha256=HXzuRsRyhHwPrGU-hc4S7RrgoOoHImqkdfbmO2geBeE,1027
165
166
  unit_tests/connector_builder/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
166
- unit_tests/connector_builder/test_connector_builder_handler.py,sha256=V9p7AFECaLqSK-iGvu0OqwV6qREQC2BhWo0H4OoiiK4,26895
167
- unit_tests/connector_builder/test_message_grouper.py,sha256=XMVRW45RDTgy1YVzkV-jOXj7Ar2mzgDV8OW2QDzZjYU,28510
167
+ unit_tests/connector_builder/test_connector_builder_handler.py,sha256=UtGSzZshZeWZcc5lt3Kt6-8aDFFwj2sLvzjCBfPkrkg,27054
168
+ unit_tests/connector_builder/test_message_grouper.py,sha256=Rek2qmuexLtfsQmHEUR_7FH-eDg3CnFiOOWVUgB9ow8,28802
168
169
  unit_tests/connector_builder/utils.py,sha256=AAggdGWP-mNuWOZUHLAVIbjTeIcdPo-3pbMm5zdYpS0,796
169
170
  unit_tests/destinations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
170
171
  unit_tests/destinations/test_destination.py,sha256=koG_j812KMkcIxoUH6XlAL3zsephZJmlHvyzJXm0dCs,10269
@@ -258,12 +259,13 @@ unit_tests/sources/streams/http/auth/test_auth.py,sha256=gdWpJ-cR64qRXmmPOQWhVd4
258
259
  unit_tests/sources/streams/http/requests_native_auth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
259
260
  unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py,sha256=_BZVsG_LZUXfBmHWTlKIw65eGkdwFSiKRlpjsccj61U,12396
260
261
  unit_tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
262
+ unit_tests/utils/test_datetime_format_inferrer.py,sha256=Io2o5flTre9gyI_IDDMpzxOjCz3sr16LO0GRqOD59uk,2946
261
263
  unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg4MNPAG-xhpk,7817
262
264
  unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
263
265
  unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
264
266
  unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
265
- airbyte_cdk-0.40.2.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
266
- airbyte_cdk-0.40.2.dist-info/METADATA,sha256=Ts5OITHn2vkPFfNNDy5V3dylvBgEZXRPIKVmuCvijZw,8902
267
- airbyte_cdk-0.40.2.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
268
- airbyte_cdk-0.40.2.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
269
- airbyte_cdk-0.40.2.dist-info/RECORD,,
267
+ airbyte_cdk-0.40.3.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
268
+ airbyte_cdk-0.40.3.dist-info/METADATA,sha256=pAfHdGCbN9Iz4q4xcnO3z3sATNNzWz4h7KX5eUQGq1I,8902
269
+ airbyte_cdk-0.40.3.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
270
+ airbyte_cdk-0.40.3.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
271
+ airbyte_cdk-0.40.3.dist-info/RECORD,,
@@ -354,6 +354,7 @@ def test_read():
354
354
  ],
355
355
  test_read_limit_reached=False,
356
356
  inferred_schema=None,
357
+ inferred_datetime_formats=None,
357
358
  latest_config_update={}
358
359
  )
359
360
 
@@ -368,6 +369,7 @@ def test_read():
368
369
  ],
369
370
  "test_read_limit_reached": False,
370
371
  "inferred_schema": None,
372
+ "inferred_datetime_formats": None,
371
373
  "latest_config_update": {}
372
374
  },
373
375
  emitted_at=1,
@@ -410,6 +412,7 @@ def test_read_returns_error_response(mock_from_exception):
410
412
  slice_descriptor=None, state=None)],
411
413
  test_read_limit_reached=False,
412
414
  inferred_schema=None,
415
+ inferred_datetime_formats={},
413
416
  latest_config_update={})
414
417
 
415
418
  expected_message = AirbyteMessage(
@@ -94,7 +94,8 @@ def test_get_grouped_messages(mock_entrypoint_read):
94
94
  "body": {"custom": "field"},
95
95
  }
96
96
  response = {"status_code": 200, "headers": {"field": "value"}, "body": '{"name": "field"}', "http_method": "GET"}
97
- expected_schema = {"$schema": "http://json-schema.org/schema#", "properties": {"name": {"type": "string"}}, "type": "object"}
97
+ expected_schema = {"$schema": "http://json-schema.org/schema#", "properties": {"name": {"type": "string"}, "date": {"type": "string"}}, "type": "object"}
98
+ expected_datetime_fields = {"date":"%Y-%m-%d"}
98
99
  expected_pages = [
99
100
  StreamReadPages(
100
101
  request=HttpRequest(
@@ -105,7 +106,7 @@ def test_get_grouped_messages(mock_entrypoint_read):
105
106
  http_method="GET",
106
107
  ),
107
108
  response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'),
108
- records=[{"name": "Shinobu Kocho"}, {"name": "Muichiro Tokito"}],
109
+ records=[{"name": "Shinobu Kocho", "date": "2023-03-03"}, {"name": "Muichiro Tokito", "date": "2023-03-04"}],
109
110
  ),
110
111
  StreamReadPages(
111
112
  request=HttpRequest(
@@ -116,7 +117,7 @@ def test_get_grouped_messages(mock_entrypoint_read):
116
117
  http_method="GET",
117
118
  ),
118
119
  response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'),
119
- records=[{"name": "Mitsuri Kanroji"}],
120
+ records=[{"name": "Mitsuri Kanroji", "date": "2023-03-05"}],
120
121
  ),
121
122
  ]
122
123
 
@@ -124,11 +125,11 @@ def test_get_grouped_messages(mock_entrypoint_read):
124
125
  [
125
126
  request_log_message(request),
126
127
  response_log_message(response),
127
- record_message("hashiras", {"name": "Shinobu Kocho"}),
128
- record_message("hashiras", {"name": "Muichiro Tokito"}),
128
+ record_message("hashiras", {"name": "Shinobu Kocho", "date": "2023-03-03"}),
129
+ record_message("hashiras", {"name": "Muichiro Tokito", "date": "2023-03-04"}),
129
130
  request_log_message(request),
130
131
  response_log_message(response),
131
- record_message("hashiras", {"name": "Mitsuri Kanroji"}),
132
+ record_message("hashiras", {"name": "Mitsuri Kanroji", "date": "2023-03-05"}),
132
133
  ]
133
134
  ))
134
135
 
@@ -138,6 +139,7 @@ def test_get_grouped_messages(mock_entrypoint_read):
138
139
  )
139
140
 
140
141
  assert actual_response.inferred_schema == expected_schema
142
+ assert actual_response.inferred_datetime_formats == expected_datetime_fields
141
143
 
142
144
  single_slice = actual_response.slices[0]
143
145
  for i, actual_page in enumerate(single_slice.pages):
@@ -0,0 +1,53 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from typing import Dict, List
6
+
7
+ import pytest
8
+ from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage
9
+ from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer
10
+
11
+ NOW = 1234567
12
+
13
+
14
+ @pytest.mark.parametrize(
15
+ "test_name,input_records,expected_candidate_fields",
16
+ [
17
+ ("empty", [], {}),
18
+ ("simple_match", [{"d": "2022-02-03"}], {"d": "%Y-%m-%d"}),
19
+ ("timestamp_match_integer", [{"d": 1686058051}], {"d": "%s"}),
20
+ ("timestamp_match_string", [{"d": "1686058051"}], {"d": "%s"}),
21
+ ("timestamp_no_match_integer", [{"d": 99}], {}),
22
+ ("timestamp_no_match_string", [{"d": "99999999999999999999"}], {}),
23
+ ("simple_no_match", [{"d": "20220203"}], {}),
24
+ ("multiple_match", [{"d": "2022-02-03", "e": "2022-02-03"}], {"d": "%Y-%m-%d", "e": "%Y-%m-%d"}),
25
+ (
26
+ "multiple_no_match",
27
+ [{"d": "20220203", "r": "ccc", "e": {"something-else": "2023-03-03"}, "s": ["2023-03-03"], "x": False, "y": 123}],
28
+ {},
29
+ ),
30
+ ("format_1", [{"d": "2022-02-03"}], {"d": "%Y-%m-%d"}),
31
+ ("format_2", [{"d": "2022-02-03 12:34:56"}], {"d": "%Y-%m-%d %H:%M:%S"}),
32
+ ("format_3", [{"d": "2022-02-03 12:34:56.123456+00:00"}], {"d": "%Y-%m-%d %H:%M:%S.%f+00:00"}),
33
+ ("format_4", [{"d": "2022-02-03T12:34:56.123456+0000"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}),
34
+ ("format_4 2", [{"d": "2022-02-03T12:34:56.000Z"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}),
35
+ ("format_4 2", [{"d": "2022-02-03T12:34:56.000000Z"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}),
36
+ ("format_6", [{"d": "03/02/2022 12:34"}], {"d": "%d/%m/%Y %H:%M"}),
37
+ ("format_7", [{"d": "2022-02"}], {"d": "%Y-%m"}),
38
+ ("format_8", [{"d": "03-02-2022"}], {"d": "%d-%m-%Y"}),
39
+ ("limit_down", [{"d": "2022-02-03", "x": "2022-02-03"}, {"d": "2022-02-03", "x": "another thing"}], {"d": "%Y-%m-%d"}),
40
+ ("limit_down all", [{"d": "2022-02-03", "x": "2022-02-03"}, {"d": "also another thing", "x": "another thing"}], {}),
41
+ ("limit_down empty", [{"d": "2022-02-03", "x": "2022-02-03"}, {}], {}),
42
+ ("limit_down unsupported type", [{"d": "2022-02-03"}, {"d": False}], {}),
43
+ ("limit_down complex type", [{"d": "2022-02-03"}, {"d": {"date": "2022-03-03"}}], {}),
44
+ ("limit_down different format", [{"d": "2022-02-03"}, {"d": 1686058051}], {}),
45
+ ("limit_down different format", [{"d": "2022-02-03"}, {"d": "2022-02-03T12:34:56.000000Z"}], {}),
46
+ ("no scope expand", [{}, {"d": "2022-02-03"}], {}),
47
+ ],
48
+ )
49
+ def test_schema_inferrer(test_name, input_records: List, expected_candidate_fields: Dict[str, str]):
50
+ inferrer = DatetimeFormatInferrer()
51
+ for record in input_records:
52
+ inferrer.accumulate(AirbyteRecordMessage(stream="abc", data=record, emitted_at=NOW))
53
+ assert inferrer.get_inferred_datetime_formats() == expected_candidate_fields