airbyte-cdk 0.72.1__py3-none-any.whl → 6.17.1.dev1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (518) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +490 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1185 -85
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +63 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +31 -3
  79. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +340 -0
  80. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  81. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  82. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  83. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +174 -74
  84. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  85. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  86. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  91. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  92. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  93. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  94. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  95. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  96. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  97. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  98. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1319 -603
  99. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  100. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  101. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  102. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1759 -225
  103. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  104. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  105. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  106. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  107. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  108. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  109. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  122. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  123. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  124. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  125. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  126. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  127. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  134. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  135. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  136. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  140. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  141. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  142. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  143. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  144. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  145. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  146. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  147. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  148. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  149. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  150. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +229 -73
  151. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  152. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  153. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  154. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  155. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  156. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  157. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  158. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  159. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  160. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  163. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  164. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  165. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  166. airbyte_cdk/sources/declarative/types.py +19 -110
  167. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  168. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  169. airbyte_cdk/sources/embedded/catalog.py +16 -4
  170. airbyte_cdk/sources/embedded/runner.py +19 -3
  171. airbyte_cdk/sources/embedded/tools.py +5 -2
  172. airbyte_cdk/sources/file_based/README.md +152 -0
  173. airbyte_cdk/sources/file_based/__init__.py +24 -0
  174. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  175. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  176. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  177. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +47 -10
  178. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  179. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  180. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  181. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  182. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  184. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  185. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  186. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  187. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  188. airbyte_cdk/sources/file_based/exceptions.py +18 -15
  189. airbyte_cdk/sources/file_based/file_based_source.py +140 -33
  190. airbyte_cdk/sources/file_based/file_based_stream_reader.py +69 -5
  191. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  192. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  193. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  194. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  195. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  196. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  197. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  198. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  199. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +141 -41
  200. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  201. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  202. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  204. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  205. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  206. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  210. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  211. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  212. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  213. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +147 -45
  214. airbyte_cdk/sources/http_logger.py +8 -3
  215. airbyte_cdk/sources/message/__init__.py +7 -1
  216. airbyte_cdk/sources/message/repository.py +18 -4
  217. airbyte_cdk/sources/source.py +42 -38
  218. airbyte_cdk/sources/streams/__init__.py +2 -2
  219. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  220. airbyte_cdk/sources/streams/call_rate.py +64 -21
  221. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  222. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  223. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  224. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  225. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  226. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  227. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  228. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  229. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  230. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  231. airbyte_cdk/sources/streams/concurrent/cursor.py +313 -48
  232. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  233. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  234. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  236. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  237. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  238. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  239. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  240. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  241. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  242. airbyte_cdk/sources/streams/core.py +412 -87
  243. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  244. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  245. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  253. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  254. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  255. airbyte_cdk/sources/streams/http/http.py +369 -246
  256. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  257. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  259. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  260. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  261. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  262. airbyte_cdk/sources/types.py +154 -0
  263. airbyte_cdk/sources/utils/record_helper.py +36 -21
  264. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  265. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  266. airbyte_cdk/sources/utils/transform.py +54 -20
  267. airbyte_cdk/sql/_util/hashing.py +34 -0
  268. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  269. airbyte_cdk/sql/constants.py +32 -0
  270. airbyte_cdk/sql/exceptions.py +235 -0
  271. airbyte_cdk/sql/secrets.py +123 -0
  272. airbyte_cdk/sql/shared/__init__.py +15 -0
  273. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  274. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  275. airbyte_cdk/sql/types.py +160 -0
  276. airbyte_cdk/test/catalog_builder.py +70 -18
  277. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  278. airbyte_cdk/test/mock_http/__init__.py +1 -1
  279. airbyte_cdk/test/mock_http/matcher.py +6 -0
  280. airbyte_cdk/test/mock_http/mocker.py +57 -10
  281. airbyte_cdk/test/mock_http/request.py +19 -3
  282. airbyte_cdk/test/mock_http/response.py +3 -1
  283. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  284. airbyte_cdk/test/state_builder.py +18 -10
  285. airbyte_cdk/test/utils/__init__.py +1 -0
  286. airbyte_cdk/test/utils/data.py +24 -0
  287. airbyte_cdk/test/utils/http_mocking.py +16 -0
  288. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  289. airbyte_cdk/test/utils/reading.py +26 -0
  290. airbyte_cdk/utils/__init__.py +2 -1
  291. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  292. airbyte_cdk/utils/analytics_message.py +10 -2
  293. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  294. airbyte_cdk/utils/event_timing.py +10 -10
  295. airbyte_cdk/utils/mapping_helpers.py +3 -1
  296. airbyte_cdk/utils/message_utils.py +20 -11
  297. airbyte_cdk/utils/print_buffer.py +75 -0
  298. airbyte_cdk/utils/schema_inferrer.py +198 -28
  299. airbyte_cdk/utils/slice_hasher.py +30 -0
  300. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  301. airbyte_cdk/utils/stream_status_utils.py +8 -1
  302. airbyte_cdk/utils/traced_exception.py +61 -21
  303. airbyte_cdk-6.17.1.dev1.dist-info/METADATA +109 -0
  304. airbyte_cdk-6.17.1.dev1.dist-info/RECORD +350 -0
  305. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev1.dist-info}/WHEEL +1 -2
  306. airbyte_cdk-6.17.1.dev1.dist-info/entry_points.txt +3 -0
  307. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  308. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  309. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  311. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  312. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  313. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  314. airbyte_cdk/sources/deprecated/client.py +0 -99
  315. airbyte_cdk/sources/singer/__init__.py +0 -8
  316. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  317. airbyte_cdk/sources/singer/source.py +0 -186
  318. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  319. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  320. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  321. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  322. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  323. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  324. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  325. airbyte_cdk/sources/utils/schema_models.py +0 -84
  326. airbyte_cdk-0.72.1.dist-info/METADATA +0 -243
  327. airbyte_cdk-0.72.1.dist-info/RECORD +0 -466
  328. airbyte_cdk-0.72.1.dist-info/top_level.txt +0 -3
  329. source_declarative_manifest/main.py +0 -29
  330. unit_tests/connector_builder/__init__.py +0 -3
  331. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  332. unit_tests/connector_builder/test_message_grouper.py +0 -713
  333. unit_tests/connector_builder/utils.py +0 -27
  334. unit_tests/destinations/test_destination.py +0 -243
  335. unit_tests/singer/test_singer_helpers.py +0 -56
  336. unit_tests/singer/test_singer_source.py +0 -112
  337. unit_tests/sources/__init__.py +0 -0
  338. unit_tests/sources/concurrent_source/__init__.py +0 -3
  339. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  340. unit_tests/sources/declarative/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/__init__.py +0 -3
  342. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  343. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  344. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  345. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  346. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  347. unit_tests/sources/declarative/checks/__init__.py +0 -3
  348. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  349. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  350. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  351. unit_tests/sources/declarative/external_component.py +0 -13
  352. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  353. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  355. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  356. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  357. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  359. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  360. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  361. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  362. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  363. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  364. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  365. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  366. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  367. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  368. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  369. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  370. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  371. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1847
  372. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  373. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  374. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  375. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  376. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  377. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  384. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  385. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  386. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  387. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  388. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  389. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  390. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  391. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  392. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  393. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  394. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  395. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  396. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  397. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  398. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  399. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  400. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  401. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  402. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  403. unit_tests/sources/declarative/schema/__init__.py +0 -6
  404. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  405. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  406. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  407. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  408. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  409. unit_tests/sources/declarative/states/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  411. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  412. unit_tests/sources/declarative/test_create_partial.py +0 -83
  413. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  414. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  415. unit_tests/sources/declarative/test_types.py +0 -39
  416. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  417. unit_tests/sources/file_based/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  419. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  420. unit_tests/sources/file_based/config/__init__.py +0 -0
  421. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  422. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  423. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  424. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  425. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  426. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  427. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  428. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  429. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  430. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  431. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  432. unit_tests/sources/file_based/helpers.py +0 -70
  433. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  434. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  435. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  436. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  437. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  438. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  439. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  440. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  441. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  442. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  443. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  444. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  445. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  446. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  447. unit_tests/sources/file_based/stream/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  449. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  450. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  451. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  452. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  453. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  454. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  455. unit_tests/sources/file_based/test_scenarios.py +0 -253
  456. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  457. unit_tests/sources/fixtures/__init__.py +0 -3
  458. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  459. unit_tests/sources/message/__init__.py +0 -0
  460. unit_tests/sources/message/test_repository.py +0 -153
  461. unit_tests/sources/streams/__init__.py +0 -0
  462. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  464. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  466. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  467. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  469. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  470. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  471. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  472. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  473. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  474. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  475. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  476. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  477. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  478. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  479. unit_tests/sources/streams/http/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  481. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  482. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  483. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  484. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  485. unit_tests/sources/streams/http/test_http.py +0 -635
  486. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  487. unit_tests/sources/streams/test_call_rate.py +0 -300
  488. unit_tests/sources/streams/test_stream_read.py +0 -405
  489. unit_tests/sources/streams/test_streams_core.py +0 -184
  490. unit_tests/sources/test_abstract_source.py +0 -1442
  491. unit_tests/sources/test_concurrent_source.py +0 -112
  492. unit_tests/sources/test_config.py +0 -92
  493. unit_tests/sources/test_connector_state_manager.py +0 -482
  494. unit_tests/sources/test_http_logger.py +0 -252
  495. unit_tests/sources/test_integration_source.py +0 -86
  496. unit_tests/sources/test_source.py +0 -684
  497. unit_tests/sources/test_source_read.py +0 -460
  498. unit_tests/test/__init__.py +0 -0
  499. unit_tests/test/mock_http/__init__.py +0 -0
  500. unit_tests/test/mock_http/test_matcher.py +0 -53
  501. unit_tests/test/mock_http/test_mocker.py +0 -214
  502. unit_tests/test/mock_http/test_request.py +0 -117
  503. unit_tests/test/mock_http/test_response_builder.py +0 -177
  504. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  505. unit_tests/utils/__init__.py +0 -0
  506. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  507. unit_tests/utils/test_mapping_helpers.py +0 -54
  508. unit_tests/utils/test_message_utils.py +0 -91
  509. unit_tests/utils/test_rate_limiting.py +0 -26
  510. unit_tests/utils/test_schema_inferrer.py +0 -202
  511. unit_tests/utils/test_secret_utils.py +0 -135
  512. unit_tests/utils/test_stream_status_utils.py +0 -61
  513. unit_tests/utils/test_traced_exception.py +0 -107
  514. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  515. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  516. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  517. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  518. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev1.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,68 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ import unidecode
10
+
11
+ from airbyte_cdk.sources.declarative.transformations import RecordTransformation
12
+ from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
13
+
14
+
15
+ @dataclass
16
+ class KeysToSnakeCaseTransformation(RecordTransformation):
17
+ token_pattern: re.Pattern[str] = re.compile(
18
+ r"[A-Z]+[a-z]*|[a-z]+|\d+|(?P<NoToken>[^a-zA-Z\d]+)"
19
+ )
20
+
21
+ def transform(
22
+ self,
23
+ record: Dict[str, Any],
24
+ config: Optional[Config] = None,
25
+ stream_state: Optional[StreamState] = None,
26
+ stream_slice: Optional[StreamSlice] = None,
27
+ ) -> None:
28
+ transformed_record = self._transform_record(record)
29
+ record.clear()
30
+ record.update(transformed_record)
31
+
32
+ def _transform_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
33
+ transformed_record = {}
34
+ for key, value in record.items():
35
+ transformed_key = self.process_key(key)
36
+ transformed_value = value
37
+
38
+ if isinstance(value, dict):
39
+ transformed_value = self._transform_record(value)
40
+
41
+ transformed_record[transformed_key] = transformed_value
42
+ return transformed_record
43
+
44
+ def process_key(self, key: str) -> str:
45
+ key = self.normalize_key(key)
46
+ tokens = self.tokenize_key(key)
47
+ tokens = self.filter_tokens(tokens)
48
+ return self.tokens_to_snake_case(tokens)
49
+
50
+ def normalize_key(self, key: str) -> str:
51
+ return unidecode.unidecode(key)
52
+
53
+ def tokenize_key(self, key: str) -> List[str]:
54
+ tokens = []
55
+ for match in self.token_pattern.finditer(key):
56
+ token = match.group(0) if match.group("NoToken") is None else ""
57
+ tokens.append(token)
58
+ return tokens
59
+
60
+ def filter_tokens(self, tokens: List[str]) -> List[str]:
61
+ if len(tokens) >= 3:
62
+ tokens = tokens[:1] + [t for t in tokens[1:-1] if t] + tokens[-1:]
63
+ if tokens and tokens[0].isdigit():
64
+ tokens.insert(0, "")
65
+ return tokens
66
+
67
+ def tokens_to_snake_case(self, tokens: List[str]) -> str:
68
+ return "_".join(token.lower() for token in tokens)
@@ -3,13 +3,14 @@
3
3
  #
4
4
 
5
5
  from dataclasses import InitVar, dataclass
6
- from typing import Any, List, Mapping, Optional
6
+ from typing import Any, Dict, List, Mapping, Optional
7
7
 
8
+ import dpath
8
9
  import dpath.exceptions
9
- import dpath.util
10
+
10
11
  from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean
11
12
  from airbyte_cdk.sources.declarative.transformations import RecordTransformation
12
- from airbyte_cdk.sources.declarative.types import Config, FieldPointer, StreamSlice, StreamState
13
+ from airbyte_cdk.sources.types import Config, FieldPointer, StreamSlice, StreamState
13
14
 
14
15
 
15
16
  @dataclass
@@ -44,15 +45,17 @@ class RemoveFields(RecordTransformation):
44
45
  condition: str = ""
45
46
 
46
47
  def __post_init__(self, parameters: Mapping[str, Any]) -> None:
47
- self._filter_interpolator = InterpolatedBoolean(condition=self.condition, parameters=parameters)
48
+ self._filter_interpolator = InterpolatedBoolean(
49
+ condition=self.condition, parameters=parameters
50
+ )
48
51
 
49
52
  def transform(
50
53
  self,
51
- record: Mapping[str, Any],
54
+ record: Dict[str, Any],
52
55
  config: Optional[Config] = None,
53
56
  stream_state: Optional[StreamState] = None,
54
57
  stream_slice: Optional[StreamSlice] = None,
55
- ) -> Mapping[str, Any]:
58
+ ) -> None:
56
59
  """
57
60
  :param record: The record to be transformed
58
61
  :return: the input record with the requested fields removed
@@ -60,13 +63,13 @@ class RemoveFields(RecordTransformation):
60
63
  for pointer in self.field_pointers:
61
64
  # the dpath library by default doesn't delete fields from arrays
62
65
  try:
63
- dpath.util.delete(
66
+ dpath.delete(
64
67
  record,
65
68
  pointer,
66
- afilter=(lambda x: self._filter_interpolator.eval(config or {}, property=x)) if self.condition else None,
69
+ afilter=(lambda x: self._filter_interpolator.eval(config or {}, property=x))
70
+ if self.condition
71
+ else None,
67
72
  )
68
73
  except dpath.exceptions.PathNotFound:
69
74
  # if the (potentially nested) property does not exist, silently skip
70
75
  pass
71
-
72
- return record
@@ -4,9 +4,9 @@
4
4
 
5
5
  from abc import abstractmethod
6
6
  from dataclasses import dataclass
7
- from typing import Any, Mapping, Optional
7
+ from typing import Any, Dict, Optional
8
8
 
9
- from airbyte_cdk.sources.declarative.types import Config, Record, StreamSlice, StreamState
9
+ from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
10
10
 
11
11
 
12
12
  @dataclass
@@ -18,13 +18,13 @@ class RecordTransformation:
18
18
  @abstractmethod
19
19
  def transform(
20
20
  self,
21
- record: Record,
21
+ record: Dict[str, Any],
22
22
  config: Optional[Config] = None,
23
23
  stream_state: Optional[StreamState] = None,
24
24
  stream_slice: Optional[StreamSlice] = None,
25
- ) -> Mapping[str, Any]:
25
+ ) -> None:
26
26
  """
27
- Transform a record by adding, deleting, or mutating fields.
27
+ Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument.
28
28
 
29
29
  :param record: The input record to be transformed
30
30
  :param config: The user-provided configuration as specified by the source's spec
@@ -4,113 +4,22 @@
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- from typing import Any, ItemsView, Iterator, KeysView, List, Mapping, Optional, ValuesView
8
-
9
- # A FieldPointer designates a path to a field inside a mapping. For example, retrieving ["k1", "k1.2"] in the object {"k1" :{"k1.2":
10
- # "hello"}] returns "hello"
11
- FieldPointer = List[str]
12
- Config = Mapping[str, Any]
13
- ConnectionDefinition = Mapping[str, Any]
14
- StreamState = Mapping[str, Any]
15
-
16
-
17
- class Record(Mapping[str, Any]):
18
- def __init__(self, data: Mapping[str, Any], associated_slice: Optional[StreamSlice]):
19
- self._data = data
20
- self._associated_slice = associated_slice
21
-
22
- @property
23
- def data(self) -> Mapping[str, Any]:
24
- return self._data
25
-
26
- @property
27
- def associated_slice(self) -> Optional[StreamSlice]:
28
- return self._associated_slice
29
-
30
- def __repr__(self) -> str:
31
- return repr(self._data)
32
-
33
- def __getitem__(self, key: str) -> Any:
34
- return self._data[key]
35
-
36
- def __len__(self) -> int:
37
- return len(self._data)
38
-
39
- def __iter__(self) -> Any:
40
- return iter(self._data)
41
-
42
- def __contains__(self, item: object) -> bool:
43
- return item in self._data
44
-
45
- def __eq__(self, other: object) -> bool:
46
- if isinstance(other, Record):
47
- # noinspection PyProtectedMember
48
- return self._data == other._data
49
- return False
50
-
51
- def __ne__(self, other: object) -> bool:
52
- return not self.__eq__(other)
53
-
54
-
55
- class StreamSlice(Mapping[str, Any]):
56
- def __init__(self, *, partition: Mapping[str, Any], cursor_slice: Mapping[str, Any]) -> None:
57
- self._partition = partition
58
- self._cursor_slice = cursor_slice
59
- if partition.keys() & cursor_slice.keys():
60
- raise ValueError("Keys for partition and incremental sync cursor should not overlap")
61
- self._stream_slice = dict(partition) | dict(cursor_slice)
62
-
63
- @property
64
- def partition(self) -> Mapping[str, Any]:
65
- p = self._partition
66
- while isinstance(p, StreamSlice):
67
- p = p.partition
68
- return p
69
-
70
- @property
71
- def cursor_slice(self) -> Mapping[str, Any]:
72
- c = self._cursor_slice
73
- while isinstance(c, StreamSlice):
74
- c = c.cursor_slice
75
- return c
76
-
77
- def __repr__(self) -> str:
78
- return repr(self._stream_slice)
79
-
80
- def __setitem__(self, key: str, value: Any) -> None:
81
- raise ValueError("StreamSlice is immutable")
82
-
83
- def __getitem__(self, key: str) -> Any:
84
- return self._stream_slice[key]
85
-
86
- def __len__(self) -> int:
87
- return len(self._stream_slice)
88
-
89
- def __iter__(self) -> Iterator[str]:
90
- return iter(self._stream_slice)
91
-
92
- def __contains__(self, item: Any) -> bool:
93
- return item in self._stream_slice
94
-
95
- def keys(self) -> KeysView[str]:
96
- return self._stream_slice.keys()
97
-
98
- def items(self) -> ItemsView[str, Any]:
99
- return self._stream_slice.items()
100
-
101
- def values(self) -> ValuesView[Any]:
102
- return self._stream_slice.values()
103
-
104
- def get(self, key: str, default: Any = None) -> Optional[Any]:
105
- return self._stream_slice.get(key, default)
106
-
107
- def __eq__(self, other: Any) -> bool:
108
- if isinstance(other, dict):
109
- return self._stream_slice == other
110
- if isinstance(other, StreamSlice):
111
- # noinspection PyProtectedMember
112
- return self._partition == other._partition and self._cursor_slice == other._cursor_slice
113
- return False
114
-
115
- def __ne__(self, other: Any) -> bool:
116
- return not self.__eq__(other)
7
+ from airbyte_cdk.sources.types import (
8
+ Config,
9
+ ConnectionDefinition,
10
+ FieldPointer,
11
+ Record,
12
+ StreamSlice,
13
+ StreamState,
14
+ )
15
+
16
+ # Note: This package originally contained class definitions for low-code CDK types, but we promoted them into the Python CDK.
17
+ # We've migrated connectors in the repository to reference the new location, but these assignments are used to retain backwards
18
+ # compatibility for sources created by OSS customers or on forks. This can be removed when we start bumping major versions.
19
+
20
+ FieldPointer = FieldPointer
21
+ Config = Config
22
+ ConnectionDefinition = ConnectionDefinition
23
+ StreamState = StreamState
24
+ Record = Record
25
+ StreamSlice = StreamSlice
@@ -3,31 +3,52 @@
3
3
  #
4
4
 
5
5
  import pkgutil
6
+ from typing import Any, List, Mapping, Optional
6
7
 
7
8
  import yaml
8
- from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
9
- from airbyte_cdk.sources.declarative.types import ConnectionDefinition
10
9
 
10
+ from airbyte_cdk.models import AirbyteStateMessage, ConfiguredAirbyteCatalog
11
+ from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
12
+ ConcurrentDeclarativeSource,
13
+ )
14
+ from airbyte_cdk.sources.types import ConnectionDefinition
11
15
 
12
- class YamlDeclarativeSource(ManifestDeclarativeSource):
16
+
17
+ class YamlDeclarativeSource(ConcurrentDeclarativeSource[List[AirbyteStateMessage]]):
13
18
  """Declarative source defined by a yaml file"""
14
19
 
15
- def __init__(self, path_to_yaml, debug: bool = False):
20
+ def __init__(
21
+ self,
22
+ path_to_yaml: str,
23
+ debug: bool = False,
24
+ catalog: Optional[ConfiguredAirbyteCatalog] = None,
25
+ config: Optional[Mapping[str, Any]] = None,
26
+ state: Optional[List[AirbyteStateMessage]] = None,
27
+ ) -> None:
16
28
  """
17
29
  :param path_to_yaml: Path to the yaml file describing the source
18
30
  """
19
31
  self._path_to_yaml = path_to_yaml
20
32
  source_config = self._read_and_parse_yaml_file(path_to_yaml)
21
- super().__init__(source_config, debug)
22
33
 
23
- def _read_and_parse_yaml_file(self, path_to_yaml_file) -> ConnectionDefinition:
34
+ super().__init__(
35
+ catalog=catalog or ConfiguredAirbyteCatalog(streams=[]),
36
+ config=config or {},
37
+ state=state or [],
38
+ source_config=source_config,
39
+ )
40
+
41
+ def _read_and_parse_yaml_file(self, path_to_yaml_file: str) -> ConnectionDefinition:
24
42
  package = self.__class__.__module__.split(".")[0]
25
43
 
26
44
  yaml_config = pkgutil.get_data(package, path_to_yaml_file)
27
- decoded_yaml = yaml_config.decode()
28
- return self._parse(decoded_yaml)
45
+ if yaml_config:
46
+ decoded_yaml = yaml_config.decode()
47
+ return self._parse(decoded_yaml)
48
+ else:
49
+ return {}
29
50
 
30
- def _emit_manifest_debug_message(self, extra_args: dict):
51
+ def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None:
31
52
  extra_args["path_to_yaml"] = self._path_to_yaml
32
53
  self.logger.debug("declarative source created from parsed YAML manifest", extra=extra_args)
33
54
 
@@ -39,4 +60,4 @@ class YamlDeclarativeSource(ManifestDeclarativeSource):
39
60
  :param connection_definition_str: yaml string to parse
40
61
  :return: The ConnectionDefinition parsed from connection_definition_str
41
62
  """
42
- return yaml.safe_load(connection_definition_str)
63
+ return yaml.safe_load(connection_definition_str) # type: ignore # yaml.safe_load doesn't return a type but know it is a Mapping
@@ -6,11 +6,15 @@ from abc import ABC, abstractmethod
6
6
  from typing import Generic, Iterable, Optional, TypeVar
7
7
 
8
8
  from airbyte_cdk.connector import TConfig
9
- from airbyte_cdk.sources.embedded.catalog import create_configured_catalog, get_stream, get_stream_names
9
+ from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStateMessage, SyncMode, Type
10
+ from airbyte_cdk.sources.embedded.catalog import (
11
+ create_configured_catalog,
12
+ get_stream,
13
+ get_stream_names,
14
+ )
10
15
  from airbyte_cdk.sources.embedded.runner import SourceRunner
11
16
  from airbyte_cdk.sources.embedded.tools import get_defined_id
12
17
  from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit
13
- from airbyte_protocol.models import AirbyteRecordMessage, AirbyteStateMessage, SyncMode, Type
14
18
 
15
19
  TOutput = TypeVar("TOutput")
16
20
 
@@ -31,11 +35,15 @@ class BaseEmbeddedIntegration(ABC, Generic[TConfig, TOutput]):
31
35
  """
32
36
  pass
33
37
 
34
- def _load_data(self, stream_name: str, state: Optional[AirbyteStateMessage] = None) -> Iterable[TOutput]:
38
+ def _load_data(
39
+ self, stream_name: str, state: Optional[AirbyteStateMessage] = None
40
+ ) -> Iterable[TOutput]:
35
41
  catalog = self.source.discover(self.config)
36
42
  stream = get_stream(catalog, stream_name)
37
43
  if not stream:
38
- raise ValueError(f"Stream {stream_name} not found, the following streams are available: {', '.join(get_stream_names(catalog))}")
44
+ raise ValueError(
45
+ f"Stream {stream_name} not found, the following streams are available: {', '.join(get_stream_names(catalog))}"
46
+ )
39
47
  if SyncMode.incremental not in stream.supported_sync_modes:
40
48
  configured_catalog = create_configured_catalog(stream, sync_mode=SyncMode.full_refresh)
41
49
  else:
@@ -43,7 +51,10 @@ class BaseEmbeddedIntegration(ABC, Generic[TConfig, TOutput]):
43
51
 
44
52
  for message in self.source.read(self.config, configured_catalog, state):
45
53
  if message.type == Type.RECORD:
46
- output = self._handle_record(message.record, get_defined_id(stream, message.record.data))
54
+ output = self._handle_record(
55
+ message.record,
56
+ get_defined_id(stream, message.record.data), # type: ignore[union-attr, arg-type]
57
+ )
47
58
  if output:
48
59
  yield output
49
60
  elif message.type is Type.STATE and message.state:
@@ -31,15 +31,27 @@ def to_configured_stream(
31
31
  primary_key: Optional[List[List[str]]] = None,
32
32
  ) -> ConfiguredAirbyteStream:
33
33
  return ConfiguredAirbyteStream(
34
- stream=stream, sync_mode=sync_mode, destination_sync_mode=destination_sync_mode, cursor_field=cursor_field, primary_key=primary_key
34
+ stream=stream,
35
+ sync_mode=sync_mode,
36
+ destination_sync_mode=destination_sync_mode,
37
+ cursor_field=cursor_field,
38
+ primary_key=primary_key,
35
39
  )
36
40
 
37
41
 
38
- def to_configured_catalog(configured_streams: List[ConfiguredAirbyteStream]) -> ConfiguredAirbyteCatalog:
42
+ def to_configured_catalog(
43
+ configured_streams: List[ConfiguredAirbyteStream],
44
+ ) -> ConfiguredAirbyteCatalog:
39
45
  return ConfiguredAirbyteCatalog(streams=configured_streams)
40
46
 
41
47
 
42
- def create_configured_catalog(stream: AirbyteStream, sync_mode: SyncMode = SyncMode.full_refresh) -> ConfiguredAirbyteCatalog:
43
- configured_streams = [to_configured_stream(stream, sync_mode=sync_mode, primary_key=stream.source_defined_primary_key)]
48
+ def create_configured_catalog(
49
+ stream: AirbyteStream, sync_mode: SyncMode = SyncMode.full_refresh
50
+ ) -> ConfiguredAirbyteCatalog:
51
+ configured_streams = [
52
+ to_configured_stream(
53
+ stream, sync_mode=sync_mode, primary_key=stream.source_defined_primary_key
54
+ )
55
+ ]
44
56
 
45
57
  return to_configured_catalog(configured_streams)
@@ -8,7 +8,13 @@ from abc import ABC, abstractmethod
8
8
  from typing import Generic, Iterable, Optional
9
9
 
10
10
  from airbyte_cdk.connector import TConfig
11
- from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog, ConnectorSpecification
11
+ from airbyte_cdk.models import (
12
+ AirbyteCatalog,
13
+ AirbyteMessage,
14
+ AirbyteStateMessage,
15
+ ConfiguredAirbyteCatalog,
16
+ ConnectorSpecification,
17
+ )
12
18
  from airbyte_cdk.sources.source import Source
13
19
 
14
20
 
@@ -22,7 +28,12 @@ class SourceRunner(ABC, Generic[TConfig]):
22
28
  pass
23
29
 
24
30
  @abstractmethod
25
- def read(self, config: TConfig, catalog: ConfiguredAirbyteCatalog, state: Optional[AirbyteStateMessage]) -> Iterable[AirbyteMessage]:
31
+ def read(
32
+ self,
33
+ config: TConfig,
34
+ catalog: ConfiguredAirbyteCatalog,
35
+ state: Optional[AirbyteStateMessage],
36
+ ) -> Iterable[AirbyteMessage]:
26
37
  pass
27
38
 
28
39
 
@@ -37,5 +48,10 @@ class CDKRunner(SourceRunner[TConfig]):
37
48
  def discover(self, config: TConfig) -> AirbyteCatalog:
38
49
  return self._source.discover(self._logger, config)
39
50
 
40
- def read(self, config: TConfig, catalog: ConfiguredAirbyteCatalog, state: Optional[AirbyteStateMessage]) -> Iterable[AirbyteMessage]:
51
+ def read(
52
+ self,
53
+ config: TConfig,
54
+ catalog: ConfiguredAirbyteCatalog,
55
+ state: Optional[AirbyteStateMessage],
56
+ ) -> Iterable[AirbyteMessage]:
41
57
  return self._source.read(self._logger, config, catalog, state=[state] if state else [])
@@ -5,10 +5,13 @@
5
5
  from typing import Any, Callable, Dict, Iterable, Optional
6
6
 
7
7
  import dpath
8
+
8
9
  from airbyte_cdk.models import AirbyteStream
9
10
 
10
11
 
11
- def get_first(iterable: Iterable[Any], predicate: Callable[[Any], bool] = lambda m: True) -> Optional[Any]:
12
+ def get_first(
13
+ iterable: Iterable[Any], predicate: Callable[[Any], bool] = lambda m: True
14
+ ) -> Optional[Any]:
12
15
  return next(filter(predicate, iterable), None)
13
16
 
14
17
 
@@ -18,7 +21,7 @@ def get_defined_id(stream: AirbyteStream, data: Dict[str, Any]) -> Optional[str]
18
21
  primary_key = []
19
22
  for key in stream.source_defined_primary_key:
20
23
  try:
21
- primary_key.append(str(dpath.util.get(data, key)))
24
+ primary_key.append(str(dpath.get(data, key)))
22
25
  except KeyError:
23
26
  primary_key.append("__not_found__")
24
27
  return "_".join(primary_key)
@@ -0,0 +1,152 @@
1
+ ## Behavior
2
+
3
+ The Airbyte protocol defines the actions `spec`, `discover`, `check` and `read` for a source to be compliant. Here is the high-level description of the flow for a file-based source:
4
+
5
+ - spec: calls AbstractFileBasedSpec.documentation_url and AbstractFileBasedSpec.schema to return a ConnectorSpecification.
6
+ - discover: calls Source.streams, and subsequently Stream.get_json_schema; this uses Source.open_file to open files during schema discovery.
7
+ - check: Source.check_connection is called from the entrypoint code (in the main CDK).
8
+ - read: Stream.read_records calls Stream.list_files which calls Source.list_matching_files, and then also uses Source.open_file to parse records from the file handle.
9
+
10
+ ## How to Implement Your Own
11
+
12
+ To create a file-based source a user must extend three classes – AbstractFileBasedSource, AbstractFileBasedSpec, and AbstractStreamReader – to create an implementation for the connector’s specific storage system. They then initialize a FileBasedSource with the instance of AbstractStreamReader specific to their storage system.
13
+
14
+ The abstract classes house the vast majority of the logic required by file-based sources. For example, when extending AbstractStreamReader, users only have to implement three methods:
15
+
16
+ - get_matching_files: lists files matching the glob pattern(s) provided in the config.
17
+ - open_file: returns a file handle for reading.
18
+ - config property setter: concrete implementations of AbstractFileBasedStreamReader's config setter should assert that `value` is of the correct config type for that type of StreamReader.
19
+
20
+ The result is that an implementation of a source might look like this:
21
+
22
+ ```
23
+ class CustomStreamReader(AbstractStreamReader):
24
+ def open_file(self, remote_file: RemoteFile) -> FileHandler:
25
+ <...>
26
+
27
+ def get_matching_files(
28
+ self,
29
+ globs: List[str],
30
+ logger: logging.Logger,
31
+ ) -> Iterable[RemoteFile]:
32
+ <...>
33
+
34
+ @config.setter
35
+ def config(self, value: Config):
36
+ assert isinstance(value, CustomConfig)
37
+ self._config = value
38
+
39
+
40
+ class CustomConfig(AbstractFileBasedSpec):
41
+ @classmethod
42
+ def documentation_url(cls) -> AnyUrl:
43
+ return AnyUrl("https://docs.airbyte.com/integrations/sources/s3", scheme="https")
44
+
45
+ a_spec_field: str = Field(title="A Spec Field", description="This is where you describe the fields of the spec", order=0)
46
+ <...>
47
+ ```
48
+
49
+ For more information, feel free to check the docstrings of each class or check specific implementations (like source-s3).
50
+
51
+ ## Supported File Types
52
+
53
+ ### Avro
54
+
55
+ Avro is a serialization format developed by [Apache](https://avro.apache.org/docs/). Avro configuration options for the file-based CDK:
56
+
57
+ - `double_as_string`: Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss of precision when handling floating point numbers.
58
+
59
+ ### CSV
60
+
61
+ CSV is a format loosely described by [RFC 4180](https://www.rfc-editor.org/rfc/rfc4180). The format is quite flexible which leads to a ton of options to consider:
62
+
63
+ - `delimiter`: The character delimiting individual cells in the CSV data. By name, CSV is comma separated so the default value is `,`
64
+ - `quote_char`: When quoted fields are used, it is possible for a field to span multiple lines, even when line breaks appear within such field. The default quote character is `"`.
65
+ - `escape_char`: The character used for escaping special characters.
66
+ - `encoding`: The character encoding of the file. By default, `UTF-8`
67
+ - `double_quote`: Whether two quotes in a quoted CSV value denote a single quote in the data.
68
+ - `quoting_behavior`: The quoting behavior determines when a value in a row should have quote marks added around it.
69
+ - `skip_rows_before_header`: The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.
70
+ - `skip_rows_after_header`: The number of rows to skip after the header row.
71
+ - `autogenerate_column_names`: If your CSV does not have a header row, the file-based CDK will need this enabled to generate column names.
72
+ - `null_values`: As CSV does not explicitly define a value for null values, the user can specify a set of case-sensitive strings that should be interpreted as null values.
73
+ - `true_values`: As CSV does not explicitly define a value for positive boolean, the user can specify a set of case-sensitive strings that should be interpreted as true values.
74
+ - `false_values`: As CSV does not explicitly define a value for negative boolean, the user can specify a set of case-sensitive strings that should be interpreted as false values.
75
+
76
+ ### JSONL
77
+
78
+ [JSONL](https://jsonlines.org/) (or JSON Lines) is a format where each row is a JSON object. There are no configuration options for this format. For backward compatibility reasons, the JSONL parser currently supports multiline objects even though this is not part of the JSONL standard. Following some data gathering, we reserve the right to remove the support for this. If files contain multiline JSON objects, performance will be slow.
79
+
80
+ ### Parquet
81
+
82
+ Parquet is a file format defined by [Apache](https://parquet.apache.org/). Configuration options are:
83
+
84
+ - `decimal_as_float`: Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.
85
+
86
+ ### Document file types (PDF, DOCX, Markdown)
87
+
88
+ For file share source connectors, the `unstructured` parser can be used to parse document file types. The textual content of the whole file will be parsed as a single record with a `content` field containing the text encoded as markdown.
89
+
90
+ To use the unstructured parser, the libraries `poppler` and `tesseract` need to be installed on the system running the connector. For example, on Ubuntu, you can install them with the following command:
91
+
92
+ ```
93
+ apt-get install -y tesseract-ocr poppler-utils
94
+ ```
95
+
96
+ On Mac, you can install these via brew:
97
+
98
+ ```
99
+ brew install poppler
100
+ brew install tesseract
101
+ ```
102
+
103
+ ## Schema
104
+
105
+ Having a schema allows the file-based CDK to take action when there is a discrepancy between a record and the expected types of the record fields.
106
+
107
+ Schema can be either inferred or user provided.
108
+
109
+ - If the user defines it in a format using JSON types, inference will not apply. Input schemas are key/value pairs of strings describing column name and data type. Supported types are `["string", "number", "integer", "object", "array", "boolean", "null"]`. For example, `{"col1": "string", "col2": "boolean"}`.
110
+ - If the user enables schemaless sync, the schema will be `{"data": "object"}` and therefore emitted records will look like `{"data": {"col1": val1, …}}`. This is recommended if the contents of the files in the stream vary significantly, and/or if the data is very nested.
111
+ - Else, the file-based CDK will infer the schema depending on the file type. Some file formats define the schema as part of their metadata (like Parquet), some do so at the record level (like Avro), and some don't have any explicit typing (like JSON or CSV). Note that all CSV values are inferred as strings except where we are supporting legacy configurations. Any file format that does not define its schema at the metadata level will require the file-based CDK to iterate over a number of records. There is a limit on the number of bytes that will be consumed in order to infer the schema.
112
+
113
+ ### Validation Policies
114
+
115
+ Users will be required to select one of 3 different options, in the event that records are encountered that don’t conform to the schema.
116
+
117
+ - Skip nonconforming records: check each record to see if it conforms to the user-input or inferred schema; skip the record if it doesn't conform. We keep a count of the number of records in each file that do and do not conform and emit a log message with these counts once we’re done reading the file.
118
+ - Emit all records: emit all records, even if they do not conform to the user-provided or inferred schema. Columns that don't exist in the configured catalog probably won't be available in the destination's table since that's the current behavior.
119
+ Only error if there are conflicting field types or malformed rows.
120
+ - Stop the sync and wait for schema re-discovery: if a record is encountered that does not conform to the configured catalog’s schema, we log a message and stop the whole sync. Note: this option is not recommended if the files have very different columns or datatypes, because the inferred schema may vary significantly at discover time.
121
+
122
+ When `schemaless` is enabled, validation will be skipped.
123
+
124
+ ## Breaking Changes (compared to previous S3 implementation)
125
+
126
+ - [CSV] Mapping of type `array` and `object`: before, they were mapped as `large_string` and hence cast as strings. Given the new changes, if `array` or `object` is specified, the value will be cast as `array` and `object` respectively.
127
+ - [CSV] Before, a string value would not be considered as `null_values` if the column type was a string. We will now start to cast string columns with values matching `null_values` to null.
128
+ - [CSV] `decimal_point` option is deprecated: It is no longer possible to use a character other than `.` to separate the integer part from the non-integer part. If a float is formatted with a character other than `.`, it will be considered a string.
129
+ - [Parquet] `columns` option is deprecated: You can use Airbyte column selection in order to have the same behavior. We don't expect it, but this could have an impact on performance, as the payload could be bigger.
130
+
131
+ ## Incremental syncs
132
+
133
+ The file-based connectors support the following [sync modes](https://docs.airbyte.com/cloud/core-concepts#connection-sync-modes):
134
+
135
+ | Feature | Supported? |
136
+ | :--------------------------------------------- | :--------- |
137
+ | Full Refresh Sync | Yes |
138
+ | Incremental Sync | Yes |
139
+ | Replicate Incremental Deletes | No |
140
+ | Replicate Multiple Files \(pattern matching\) | Yes |
141
+ | Replicate Multiple Streams \(distinct tables\) | Yes |
142
+ | Namespaces | No |
143
+
144
+ We recommend you do not manually modify files that are already synced. The connector has file-level granularity, which means adding or modifying a row in a CSV file will trigger a re-sync of the content of that file.
145
+
146
+ ### Incremental sync
147
+
148
+ After the initial sync, the connector only pulls files that were modified since the last sync.
149
+
150
+ The connector checkpoints the connection state when it is done syncing all files for a given timestamp. The connection's state only keeps track of the last 10 000 files synced. If more than 10 000 files are synced, the connector won't be able to rely on the connection state to deduplicate files. In this case, the connector will initialize its cursor to the minimum of the earliest file in the history and 3 days ago.
151
+
152
+ Both the maximum number of files and the time buffer can be configured by connector developers.