airbyte-cdk 0.40.1__py3-none-any.whl → 0.40.3__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,6 +14,7 @@ from airbyte_cdk.entrypoint import AirbyteEntrypoint
14
14
  from airbyte_cdk.sources import AbstractSource
15
15
  from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource
16
16
  from airbyte_cdk.utils import AirbyteTracedException
17
+ from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer
17
18
  from airbyte_cdk.utils.schema_inferrer import SchemaInferrer
18
19
  from airbyte_protocol.models.airbyte_protocol import (
19
20
  AirbyteControlMessage,
@@ -46,6 +47,7 @@ class MessageGrouper:
46
47
  if record_limit is not None and not (1 <= record_limit <= 1000):
47
48
  raise ValueError(f"Record limit must be between 1 and 1000. Got {record_limit}")
48
49
  schema_inferrer = SchemaInferrer()
50
+ datetime_format_inferrer = DatetimeFormatInferrer()
49
51
 
50
52
  if record_limit is None:
51
53
  record_limit = self._max_record_limit
@@ -58,6 +60,7 @@ class MessageGrouper:
58
60
  for message_group in self._get_message_groups(
59
61
  self._read_stream(source, config, configured_catalog),
60
62
  schema_inferrer,
63
+ datetime_format_inferrer,
61
64
  record_limit,
62
65
  ):
63
66
  if isinstance(message_group, AirbyteLogMessage):
@@ -80,10 +83,11 @@ class MessageGrouper:
80
83
  configured_catalog.streams[0].stream.name
81
84
  ), # The connector builder currently only supports reading from a single stream at a time
82
85
  latest_config_update=latest_config_update.connectorConfig.config if latest_config_update else self._clean_config(config),
86
+ inferred_datetime_formats=datetime_format_inferrer.get_inferred_datetime_formats(),
83
87
  )
84
88
 
85
89
  def _get_message_groups(
86
- self, messages: Iterator[AirbyteMessage], schema_inferrer: SchemaInferrer, limit: int
90
+ self, messages: Iterator[AirbyteMessage], schema_inferrer: SchemaInferrer, datetime_format_inferrer: DatetimeFormatInferrer, limit: int
87
91
  ) -> Iterable[Union[StreamReadPages, AirbyteControlMessage, AirbyteLogMessage, AirbyteTraceMessage]]:
88
92
  """
89
93
  Message groups are partitioned according to when request log messages are received. Subsequent response log messages
@@ -141,6 +145,7 @@ class MessageGrouper:
141
145
  current_page_records.append(message.record.data)
142
146
  records_count += 1
143
147
  schema_inferrer.accumulate(message.record)
148
+ datetime_format_inferrer.accumulate(message.record)
144
149
  elif message.type == MessageType.CONTROL and message.control.type == OrchestratorType.CONNECTOR_CONFIG:
145
150
  yield message.control
146
151
  else:
@@ -48,6 +48,7 @@ class StreamRead(object):
48
48
  slices: List[StreamReadSlices]
49
49
  test_read_limit_reached: bool
50
50
  inferred_schema: Optional[Dict[str, Any]]
51
+ inferred_datetime_formats: Optional[Dict[str, str]]
51
52
  latest_config_update: Optional[Dict[str, Any]]
52
53
 
53
54
 
@@ -27,6 +27,10 @@ properties:
27
27
  type: object
28
28
  spec:
29
29
  "$ref": "#/definitions/Spec"
30
+ metadata:
31
+ type: object
32
+ description: For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.
33
+ additionalProperties: true
30
34
  additionalProperties: false
31
35
  definitions:
32
36
  AddedFieldDefinition:
@@ -1095,6 +1095,10 @@ class DeclarativeSource(BaseModel):
1095
1095
  schemas: Optional[Schemas] = None
1096
1096
  definitions: Optional[Dict[str, Any]] = None
1097
1097
  spec: Optional[Spec] = None
1098
+ metadata: Optional[Dict[str, Any]] = Field(
1099
+ None,
1100
+ description="For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.",
1101
+ )
1098
1102
 
1099
1103
 
1100
1104
  class DeclarativeStream(BaseModel):
@@ -0,0 +1,80 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from typing import Any, Dict, Union
6
+
7
+ from airbyte_cdk.models import AirbyteRecordMessage
8
+ from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser
9
+
10
+
11
class DatetimeFormatInferrer:
    """
    Detects top-level fields in records that might be datetime values, along with the format they use.

    The first accumulated record seeds the set of candidate fields. Every later record can only
    narrow that set: a candidate is kept only while each record carries a value for it that still
    matches the format inferred from the first record.
    """

    def __init__(self):
        """Sets up the parser, the ordered list of formats to try and the numeric timestamp range"""
        self._parser = DatetimeParser()
        # None means "no record seen yet"; an empty dict means "records were seen, no candidate is left".
        self._datetime_candidates: Union[None, Dict[str, str]] = None
        # Formats are tried in this order and the first one that parses wins.
        self._formats = [
            "%Y-%m-%d",
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d %H:%M:%S.%f+00:00",
            "%Y-%m-%dT%H:%M:%S.%f%z",
            "%s",
            "%d/%m/%Y %H:%M",
            "%Y-%m",
            "%d-%m-%Y",
            "%Y-%m-%dT%H:%M:%SZ",
        ]
        # Purely numeric values are only considered when they fall inside this range (upper bound excluded).
        self._timestamp_heuristic_range = range(1_000_000_000, 2_000_000_000)

    def _can_be_datetime(self, value: Any) -> bool:
        """
        Cheap pre-check that runs before any format parsing is attempted.

        A string qualifies unless it is purely decimal digits outside of the timestamp range.
        An integer qualifies only inside the timestamp range. Every other type is rejected.
        """
        if isinstance(value, str):
            return not value.isdecimal() or int(value) in self._timestamp_heuristic_range
        if isinstance(value, int):
            # bool is a subclass of int, but True/False (1/0) are outside of the range and are rejected here.
            return value in self._timestamp_heuristic_range
        return False

    def _matches_format(self, value: Any, format: str) -> bool:
        """Checks if the value matches the format, i.e. the parser accepts it without raising ValueError"""
        # NOTE: `format` shadows the builtin; the parameter name is kept so keyword callers keep working.
        try:
            self._parser.parse(value, format)
        except ValueError:
            return False
        return True

    def _initialize(self, record: "AirbyteRecordMessage"):
        """Seeds the candidates from the first record: each eligible field is mapped to the first format it matches"""
        self._datetime_candidates = {}
        for field_name, field_value in record.data.items():
            if not self._can_be_datetime(field_value):
                continue
            for candidate_format in self._formats:
                if self._matches_format(field_value, candidate_format):
                    self._datetime_candidates[field_name] = candidate_format
                    break

    def _validate(self, record: "AirbyteRecordMessage"):
        """Keeps only the candidates whose value in the record is present and still matches the inferred format"""
        still_valid = {}
        for field_name, field_format in self._datetime_candidates.items():
            current_value = record.data.get(field_name)
            if (
                current_value is not None
                and self._can_be_datetime(current_value)
                and self._matches_format(current_value, field_format)
            ):
                still_valid[field_name] = field_format
        self._datetime_candidates = still_valid

    def accumulate(self, record: "AirbyteRecordMessage"):
        """Analyzes the record and updates the internal state of candidate datetime fields"""
        if self._datetime_candidates is None:
            self._initialize(record)
        else:
            self._validate(record)

    def get_inferred_datetime_formats(self) -> Dict[str, str]:
        """
        Returns the candidate datetime fields as a dict - the keys are the field names and the values are the inferred datetime formats.
        For these fields the format was consistent across all visited records.
        The returned dict is a copy, so changing it does not affect the state of the inferrer.
        """
        return dict(self._datetime_candidates or {})
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 0.40.1
3
+ Version: 0.40.3
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://github.com/airbytehq/airbyte
6
6
  Author: Airbyte
@@ -8,8 +8,8 @@ airbyte_cdk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  airbyte_cdk/connector_builder/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
9
9
  airbyte_cdk/connector_builder/connector_builder_handler.py,sha256=q8mqQjNqpvHZgwVbNuvSe19o4Aw6MQTuhA2URmdz0K0,5443
10
10
  airbyte_cdk/connector_builder/main.py,sha256=jn2gqaYAvd6uDoFe0oVhnY23grm5sL-jfIX6kGvhVxk,2994
11
- airbyte_cdk/connector_builder/message_grouper.py,sha256=uJGOBhinvbisgAa-bQN3XE2L2xFTeVeykLwDCRYcxgc,12110
12
- airbyte_cdk/connector_builder/models.py,sha256=yW_j91B-3FYNTNbWjR2ZVYTXBHlskT55uxdAqg7FhAE,1221
11
+ airbyte_cdk/connector_builder/message_grouper.py,sha256=yEjvwdXgzYK29xwjl88-4s-J49iaud8_aOrAlOkAzsg,12504
12
+ airbyte_cdk/connector_builder/models.py,sha256=jL2SJIWJTLCbBqobw5Qo8WGS0aN-K9TRmfSpDHM5vYc,1277
13
13
  airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6fQFbWKY,126
14
14
  airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
15
15
  airbyte_cdk/models/__init__.py,sha256=LPQcYdDPwrCXiBPe_jexO4UAcbovIb1V9tHB6I7Un30,633
@@ -22,7 +22,7 @@ airbyte_cdk/sources/connector_state_manager.py,sha256=_R-2QnMGimKL0t5aV4f6P1dgd-
22
22
  airbyte_cdk/sources/source.py,sha256=N3vHZzdUsBETFsql-YpO-LcgjolT_jcnAuHBhGD6Hqk,4278
23
23
  airbyte_cdk/sources/declarative/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
24
24
  airbyte_cdk/sources/declarative/create_partial.py,sha256=sUJOwD8hBzW4pxw2XhYlSTMgl-WMc5WpP5Oq_jo3fHw,3371
25
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=92f7DKAim9qPMdAh4-vgFTIIGCb1Fz0D--nNpZUSDo4,73990
25
+ airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=-Kt09XCMs61gEphShtPTMGrqVAamr4cml03_YjDuTLQ,74196
26
26
  airbyte_cdk/sources/declarative/declarative_source.py,sha256=U2As9PDKmcWDgbsWUo-RetJ9fxQOBlwntWZ0NOgs5Ac,1453
27
27
  airbyte_cdk/sources/declarative/declarative_stream.py,sha256=0iZSpypxt8bhO3Lmf3BpGRTO7Fp0Q2GI8m8xyJJUjeM,6580
28
28
  airbyte_cdk/sources/declarative/exceptions.py,sha256=kTPUA4I2NV4J6HDz-mKPGMrfuc592akJnOyYx38l_QM,176
@@ -60,7 +60,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=dyIM-bzh54
60
60
  airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=Dc0F87nElWsz_Ikj938eQ9uqZvyqgFhZ8Dqf_-hvndc,4800
61
61
  airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=V6WGKJ9cXX1rjuM4bK3Cs9xEryMlkY2U3FMsSBhrgC8,3098
62
62
  airbyte_cdk/sources/declarative/models/__init__.py,sha256=EiYnzwCHZV7EYqMJqcy6xKSeHvTKZBsQndjbEwmiTW4,93
63
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=FnDt_AFbXConXGvWEs3gCpNq7wACUox3u2z6Lpr2lBI,49884
63
+ airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=7XeAhmGHuNRYK97KwxvbrNXS1Az95O7gOMM3uRlGjrU,50104
64
64
  airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
65
65
  airbyte_cdk/sources/declarative/parsers/class_types_registry.py,sha256=bK4a74opm6WHyV7HqOVws6GE5Z7cLNc5MaTha69abIQ,6086
66
66
  airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=y7_G5mM07zxT5YG975kdC2PAja-Uc83pYp8WrV3GNdo,522
@@ -156,6 +156,7 @@ airbyte_cdk/sources/utils/schema_models.py,sha256=m1vOqNkkVYGblc492wKo11Zm5FK9F0
156
156
  airbyte_cdk/sources/utils/transform.py,sha256=4GYmO6bq33HF-a1in0dKQKqUOYI1bWItyuYF875bSQg,9493
157
157
  airbyte_cdk/utils/__init__.py,sha256=kFLcs2P-tbPyeVOJS9rOv1jZdnSpjG24ro0CHgt_CIk,215
158
158
  airbyte_cdk/utils/airbyte_secrets_utils.py,sha256=q3aDl8T10ufGbeqnUPqbZLxQcHdkf2kDfQK_upWzBbI,2894
159
+ airbyte_cdk/utils/datetime_format_inferrer.py,sha256=1z5lGq_DI9LFrT68ftlJSqndS6i-Rs1PX7T_RBtOJpA,3443
159
160
  airbyte_cdk/utils/event_timing.py,sha256=Hn5kCc9xGKLcV5EYpJCZwNiz9neKKu2WG8FJF_hy278,2377
160
161
  airbyte_cdk/utils/schema_inferrer.py,sha256=j0us_mEMj8PVVzSZfoS1adK7V7a--mSHQozo6xmsiIc,3720
161
162
  airbyte_cdk/utils/stream_status_utils.py,sha256=X1Vy7BhglycjdIWpfKDfwJussNCxYffelKt6Utjx-qY,1005
@@ -163,8 +164,8 @@ airbyte_cdk/utils/traced_exception.py,sha256=9G2sG9eYkvn6Aa7rMuUW_KIRszRaTc_xdnT
163
164
  source_declarative_manifest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
164
165
  source_declarative_manifest/main.py,sha256=HXzuRsRyhHwPrGU-hc4S7RrgoOoHImqkdfbmO2geBeE,1027
165
166
  unit_tests/connector_builder/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
166
- unit_tests/connector_builder/test_connector_builder_handler.py,sha256=V9p7AFECaLqSK-iGvu0OqwV6qREQC2BhWo0H4OoiiK4,26895
167
- unit_tests/connector_builder/test_message_grouper.py,sha256=XMVRW45RDTgy1YVzkV-jOXj7Ar2mzgDV8OW2QDzZjYU,28510
167
+ unit_tests/connector_builder/test_connector_builder_handler.py,sha256=UtGSzZshZeWZcc5lt3Kt6-8aDFFwj2sLvzjCBfPkrkg,27054
168
+ unit_tests/connector_builder/test_message_grouper.py,sha256=Rek2qmuexLtfsQmHEUR_7FH-eDg3CnFiOOWVUgB9ow8,28802
168
169
  unit_tests/connector_builder/utils.py,sha256=AAggdGWP-mNuWOZUHLAVIbjTeIcdPo-3pbMm5zdYpS0,796
169
170
  unit_tests/destinations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
170
171
  unit_tests/destinations/test_destination.py,sha256=koG_j812KMkcIxoUH6XlAL3zsephZJmlHvyzJXm0dCs,10269
@@ -258,12 +259,13 @@ unit_tests/sources/streams/http/auth/test_auth.py,sha256=gdWpJ-cR64qRXmmPOQWhVd4
258
259
  unit_tests/sources/streams/http/requests_native_auth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
259
260
  unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py,sha256=_BZVsG_LZUXfBmHWTlKIw65eGkdwFSiKRlpjsccj61U,12396
260
261
  unit_tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
262
+ unit_tests/utils/test_datetime_format_inferrer.py,sha256=Io2o5flTre9gyI_IDDMpzxOjCz3sr16LO0GRqOD59uk,2946
261
263
  unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg4MNPAG-xhpk,7817
262
264
  unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
263
265
  unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
264
266
  unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
265
- airbyte_cdk-0.40.1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
266
- airbyte_cdk-0.40.1.dist-info/METADATA,sha256=BAymaIj0GzeNsxop-ZY5Pm5EHnXdsCtRNGEojqpAHWM,8902
267
- airbyte_cdk-0.40.1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
268
- airbyte_cdk-0.40.1.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
269
- airbyte_cdk-0.40.1.dist-info/RECORD,,
267
+ airbyte_cdk-0.40.3.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
268
+ airbyte_cdk-0.40.3.dist-info/METADATA,sha256=pAfHdGCbN9Iz4q4xcnO3z3sATNNzWz4h7KX5eUQGq1I,8902
269
+ airbyte_cdk-0.40.3.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
270
+ airbyte_cdk-0.40.3.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
271
+ airbyte_cdk-0.40.3.dist-info/RECORD,,
@@ -354,6 +354,7 @@ def test_read():
354
354
  ],
355
355
  test_read_limit_reached=False,
356
356
  inferred_schema=None,
357
+ inferred_datetime_formats=None,
357
358
  latest_config_update={}
358
359
  )
359
360
 
@@ -368,6 +369,7 @@ def test_read():
368
369
  ],
369
370
  "test_read_limit_reached": False,
370
371
  "inferred_schema": None,
372
+ "inferred_datetime_formats": None,
371
373
  "latest_config_update": {}
372
374
  },
373
375
  emitted_at=1,
@@ -410,6 +412,7 @@ def test_read_returns_error_response(mock_from_exception):
410
412
  slice_descriptor=None, state=None)],
411
413
  test_read_limit_reached=False,
412
414
  inferred_schema=None,
415
+ inferred_datetime_formats={},
413
416
  latest_config_update={})
414
417
 
415
418
  expected_message = AirbyteMessage(
@@ -94,7 +94,8 @@ def test_get_grouped_messages(mock_entrypoint_read):
94
94
  "body": {"custom": "field"},
95
95
  }
96
96
  response = {"status_code": 200, "headers": {"field": "value"}, "body": '{"name": "field"}', "http_method": "GET"}
97
- expected_schema = {"$schema": "http://json-schema.org/schema#", "properties": {"name": {"type": "string"}}, "type": "object"}
97
+ expected_schema = {"$schema": "http://json-schema.org/schema#", "properties": {"name": {"type": "string"}, "date": {"type": "string"}}, "type": "object"}
98
+ expected_datetime_fields = {"date":"%Y-%m-%d"}
98
99
  expected_pages = [
99
100
  StreamReadPages(
100
101
  request=HttpRequest(
@@ -105,7 +106,7 @@ def test_get_grouped_messages(mock_entrypoint_read):
105
106
  http_method="GET",
106
107
  ),
107
108
  response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'),
108
- records=[{"name": "Shinobu Kocho"}, {"name": "Muichiro Tokito"}],
109
+ records=[{"name": "Shinobu Kocho", "date": "2023-03-03"}, {"name": "Muichiro Tokito", "date": "2023-03-04"}],
109
110
  ),
110
111
  StreamReadPages(
111
112
  request=HttpRequest(
@@ -116,7 +117,7 @@ def test_get_grouped_messages(mock_entrypoint_read):
116
117
  http_method="GET",
117
118
  ),
118
119
  response=HttpResponse(status=200, headers={"field": "value"}, body='{"name": "field"}'),
119
- records=[{"name": "Mitsuri Kanroji"}],
120
+ records=[{"name": "Mitsuri Kanroji", "date": "2023-03-05"}],
120
121
  ),
121
122
  ]
122
123
 
@@ -124,11 +125,11 @@ def test_get_grouped_messages(mock_entrypoint_read):
124
125
  [
125
126
  request_log_message(request),
126
127
  response_log_message(response),
127
- record_message("hashiras", {"name": "Shinobu Kocho"}),
128
- record_message("hashiras", {"name": "Muichiro Tokito"}),
128
+ record_message("hashiras", {"name": "Shinobu Kocho", "date": "2023-03-03"}),
129
+ record_message("hashiras", {"name": "Muichiro Tokito", "date": "2023-03-04"}),
129
130
  request_log_message(request),
130
131
  response_log_message(response),
131
- record_message("hashiras", {"name": "Mitsuri Kanroji"}),
132
+ record_message("hashiras", {"name": "Mitsuri Kanroji", "date": "2023-03-05"}),
132
133
  ]
133
134
  ))
134
135
 
@@ -138,6 +139,7 @@ def test_get_grouped_messages(mock_entrypoint_read):
138
139
  )
139
140
 
140
141
  assert actual_response.inferred_schema == expected_schema
142
+ assert actual_response.inferred_datetime_formats == expected_datetime_fields
141
143
 
142
144
  single_slice = actual_response.slices[0]
143
145
  for i, actual_page in enumerate(single_slice.pages):
@@ -0,0 +1,53 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from typing import Dict, List
6
+
7
+ import pytest
8
+ from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage
9
+ from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer
10
+
11
+ NOW = 1234567
12
+
13
+
14
@pytest.mark.parametrize(
    "test_name,input_records,expected_candidate_fields",
    [
        ("empty", [], {}),
        ("simple_match", [{"d": "2022-02-03"}], {"d": "%Y-%m-%d"}),
        ("timestamp_match_integer", [{"d": 1686058051}], {"d": "%s"}),
        ("timestamp_match_string", [{"d": "1686058051"}], {"d": "%s"}),
        ("timestamp_no_match_integer", [{"d": 99}], {}),
        ("timestamp_no_match_string", [{"d": "99999999999999999999"}], {}),
        ("simple_no_match", [{"d": "20220203"}], {}),
        ("multiple_match", [{"d": "2022-02-03", "e": "2022-02-03"}], {"d": "%Y-%m-%d", "e": "%Y-%m-%d"}),
        (
            "multiple_no_match",
            [{"d": "20220203", "r": "ccc", "e": {"something-else": "2023-03-03"}, "s": ["2023-03-03"], "x": False, "y": 123}],
            {},
        ),
        ("format_1", [{"d": "2022-02-03"}], {"d": "%Y-%m-%d"}),
        ("format_2", [{"d": "2022-02-03 12:34:56"}], {"d": "%Y-%m-%d %H:%M:%S"}),
        ("format_3", [{"d": "2022-02-03 12:34:56.123456+00:00"}], {"d": "%Y-%m-%d %H:%M:%S.%f+00:00"}),
        ("format_4", [{"d": "2022-02-03T12:34:56.123456+0000"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}),
        ("format_4 2", [{"d": "2022-02-03T12:34:56.000Z"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}),
        ("format_4 3", [{"d": "2022-02-03T12:34:56.000000Z"}], {"d": "%Y-%m-%dT%H:%M:%S.%f%z"}),
        ("format_6", [{"d": "03/02/2022 12:34"}], {"d": "%d/%m/%Y %H:%M"}),
        ("format_7", [{"d": "2022-02"}], {"d": "%Y-%m"}),
        ("format_8", [{"d": "03-02-2022"}], {"d": "%d-%m-%Y"}),
        # The last configured format had no dedicated case; no earlier format accepts a bare "Z" without fractional seconds.
        ("format_9", [{"d": "2022-02-03T12:34:56Z"}], {"d": "%Y-%m-%dT%H:%M:%SZ"}),
        ("limit_down", [{"d": "2022-02-03", "x": "2022-02-03"}, {"d": "2022-02-03", "x": "another thing"}], {"d": "%Y-%m-%d"}),
        ("limit_down all", [{"d": "2022-02-03", "x": "2022-02-03"}, {"d": "also another thing", "x": "another thing"}], {}),
        ("limit_down empty", [{"d": "2022-02-03", "x": "2022-02-03"}, {}], {}),
        ("limit_down unsupported type", [{"d": "2022-02-03"}, {"d": False}], {}),
        ("limit_down complex type", [{"d": "2022-02-03"}, {"d": {"date": "2022-03-03"}}], {}),
        ("limit_down different format timestamp", [{"d": "2022-02-03"}, {"d": 1686058051}], {}),
        ("limit_down different format string", [{"d": "2022-02-03"}, {"d": "2022-02-03T12:34:56.000000Z"}], {}),
        ("no scope expand", [{}, {"d": "2022-02-03"}], {}),
    ],
)
def test_schema_inferrer(test_name, input_records: List, expected_candidate_fields: Dict[str, str]):
    """
    Feeds the records, in order, into a fresh DatetimeFormatInferrer and checks the inferred formats.

    `test_name` is only a readable label for the case and is not used by the test body.
    The function name is kept as is so that the existing pytest node id stays stable.
    """
    inferrer = DatetimeFormatInferrer()
    for record in input_records:
        inferrer.accumulate(AirbyteRecordMessage(stream="abc", data=record, emitted_at=NOW))
    assert inferrer.get_inferred_datetime_formats() == expected_candidate_fields