airbyte-cdk 0.72.0__py3-none-any.whl → 6.13.1.dev4106__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (517) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +421 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1213 -88
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +65 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +25 -3
  79. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  80. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  81. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  82. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +159 -74
  83. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  84. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  85. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  86. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  91. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  92. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  93. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  94. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  95. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  96. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  97. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1329 -595
  98. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  99. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  100. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  101. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1699 -226
  102. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  103. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  104. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  105. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  106. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  107. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  108. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  109. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  122. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  123. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  124. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  125. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  126. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  127. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  134. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  135. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  136. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  140. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  141. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  142. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  143. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  144. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  145. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  146. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  147. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  148. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  149. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +228 -72
  150. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  151. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  152. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  153. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  154. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  155. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  156. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  157. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  158. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  159. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  160. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  163. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  164. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  165. airbyte_cdk/sources/declarative/types.py +19 -110
  166. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  167. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  168. airbyte_cdk/sources/embedded/catalog.py +16 -4
  169. airbyte_cdk/sources/embedded/runner.py +19 -3
  170. airbyte_cdk/sources/embedded/tools.py +5 -2
  171. airbyte_cdk/sources/file_based/README.md +152 -0
  172. airbyte_cdk/sources/file_based/__init__.py +24 -0
  173. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  174. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  175. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  176. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +58 -10
  177. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  178. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  179. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  180. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  181. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  182. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  184. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  185. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  186. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  187. airbyte_cdk/sources/file_based/exceptions.py +52 -15
  188. airbyte_cdk/sources/file_based/file_based_source.py +163 -33
  189. airbyte_cdk/sources/file_based/file_based_stream_reader.py +83 -5
  190. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  191. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  192. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  193. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  194. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  195. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  196. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  197. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  198. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +145 -41
  199. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  200. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  201. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  202. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  204. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  205. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  206. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  210. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  211. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  212. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +175 -45
  213. airbyte_cdk/sources/http_logger.py +8 -3
  214. airbyte_cdk/sources/message/__init__.py +7 -1
  215. airbyte_cdk/sources/message/repository.py +18 -4
  216. airbyte_cdk/sources/source.py +42 -38
  217. airbyte_cdk/sources/streams/__init__.py +2 -2
  218. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  219. airbyte_cdk/sources/streams/call_rate.py +64 -21
  220. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  221. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  222. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  223. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  224. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  225. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  226. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  227. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  228. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  229. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  230. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  231. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  232. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  233. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  234. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  236. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  237. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  238. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  239. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  240. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  241. airbyte_cdk/sources/streams/core.py +412 -87
  242. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  243. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  244. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  245. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  253. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  254. airbyte_cdk/sources/streams/http/http.py +369 -246
  255. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  256. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  257. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  259. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  260. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  261. airbyte_cdk/sources/types.py +154 -0
  262. airbyte_cdk/sources/utils/record_helper.py +36 -21
  263. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  264. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  265. airbyte_cdk/sources/utils/transform.py +54 -20
  266. airbyte_cdk/sql/_util/hashing.py +34 -0
  267. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  268. airbyte_cdk/sql/constants.py +32 -0
  269. airbyte_cdk/sql/exceptions.py +235 -0
  270. airbyte_cdk/sql/secrets.py +123 -0
  271. airbyte_cdk/sql/shared/__init__.py +15 -0
  272. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  273. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  274. airbyte_cdk/sql/types.py +160 -0
  275. airbyte_cdk/test/catalog_builder.py +70 -18
  276. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  277. airbyte_cdk/test/mock_http/__init__.py +1 -1
  278. airbyte_cdk/test/mock_http/matcher.py +6 -0
  279. airbyte_cdk/test/mock_http/mocker.py +57 -10
  280. airbyte_cdk/test/mock_http/request.py +19 -3
  281. airbyte_cdk/test/mock_http/response.py +3 -1
  282. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  283. airbyte_cdk/test/state_builder.py +18 -10
  284. airbyte_cdk/test/utils/__init__.py +1 -0
  285. airbyte_cdk/test/utils/data.py +24 -0
  286. airbyte_cdk/test/utils/http_mocking.py +16 -0
  287. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  288. airbyte_cdk/test/utils/reading.py +26 -0
  289. airbyte_cdk/utils/__init__.py +2 -1
  290. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  291. airbyte_cdk/utils/analytics_message.py +10 -2
  292. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  293. airbyte_cdk/utils/event_timing.py +10 -10
  294. airbyte_cdk/utils/mapping_helpers.py +3 -1
  295. airbyte_cdk/utils/message_utils.py +20 -11
  296. airbyte_cdk/utils/print_buffer.py +75 -0
  297. airbyte_cdk/utils/schema_inferrer.py +198 -28
  298. airbyte_cdk/utils/slice_hasher.py +30 -0
  299. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  300. airbyte_cdk/utils/stream_status_utils.py +8 -1
  301. airbyte_cdk/utils/traced_exception.py +61 -21
  302. airbyte_cdk-6.13.1.dev4106.dist-info/METADATA +109 -0
  303. airbyte_cdk-6.13.1.dev4106.dist-info/RECORD +349 -0
  304. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/WHEEL +1 -2
  305. airbyte_cdk-6.13.1.dev4106.dist-info/entry_points.txt +3 -0
  306. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  307. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  308. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  309. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  311. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  312. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  313. airbyte_cdk/sources/deprecated/client.py +0 -99
  314. airbyte_cdk/sources/singer/__init__.py +0 -8
  315. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  316. airbyte_cdk/sources/singer/source.py +0 -186
  317. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  318. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  319. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  320. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  321. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  322. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  323. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  324. airbyte_cdk/sources/utils/schema_models.py +0 -84
  325. airbyte_cdk-0.72.0.dist-info/METADATA +0 -243
  326. airbyte_cdk-0.72.0.dist-info/RECORD +0 -466
  327. airbyte_cdk-0.72.0.dist-info/top_level.txt +0 -3
  328. source_declarative_manifest/main.py +0 -29
  329. unit_tests/connector_builder/__init__.py +0 -3
  330. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  331. unit_tests/connector_builder/test_message_grouper.py +0 -713
  332. unit_tests/connector_builder/utils.py +0 -27
  333. unit_tests/destinations/test_destination.py +0 -243
  334. unit_tests/singer/test_singer_helpers.py +0 -56
  335. unit_tests/singer/test_singer_source.py +0 -112
  336. unit_tests/sources/__init__.py +0 -0
  337. unit_tests/sources/concurrent_source/__init__.py +0 -3
  338. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  339. unit_tests/sources/declarative/__init__.py +0 -3
  340. unit_tests/sources/declarative/auth/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  342. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  343. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  344. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  345. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  346. unit_tests/sources/declarative/checks/__init__.py +0 -3
  347. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  348. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  349. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  350. unit_tests/sources/declarative/external_component.py +0 -13
  351. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  352. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  353. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  355. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  356. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  357. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  359. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  360. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  361. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  362. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  363. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  364. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  365. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  366. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  367. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  368. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  369. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  370. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1841
  371. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  372. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  373. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  374. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  375. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  376. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  377. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  384. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  385. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  386. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  387. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  388. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  389. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  390. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  391. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  392. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  393. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  394. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  395. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  396. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  397. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  398. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  399. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  400. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  401. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  402. unit_tests/sources/declarative/schema/__init__.py +0 -6
  403. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  404. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  405. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  406. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  407. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  408. unit_tests/sources/declarative/states/__init__.py +0 -3
  409. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  411. unit_tests/sources/declarative/test_create_partial.py +0 -83
  412. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  413. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  414. unit_tests/sources/declarative/test_types.py +0 -39
  415. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  416. unit_tests/sources/file_based/__init__.py +0 -0
  417. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  419. unit_tests/sources/file_based/config/__init__.py +0 -0
  420. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  421. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  422. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  423. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  424. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  425. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  426. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  427. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  428. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  429. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  430. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  431. unit_tests/sources/file_based/helpers.py +0 -70
  432. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  433. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  434. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  435. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  436. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  437. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  438. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  439. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  440. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  441. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  442. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  443. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  444. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  445. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  446. unit_tests/sources/file_based/stream/__init__.py +0 -0
  447. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  449. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  450. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  451. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  452. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  453. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  454. unit_tests/sources/file_based/test_scenarios.py +0 -253
  455. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  456. unit_tests/sources/fixtures/__init__.py +0 -3
  457. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  458. unit_tests/sources/message/__init__.py +0 -0
  459. unit_tests/sources/message/test_repository.py +0 -153
  460. unit_tests/sources/streams/__init__.py +0 -0
  461. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  462. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  464. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  466. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  467. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  469. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  470. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  471. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  472. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  473. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  474. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  475. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  476. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  477. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  478. unit_tests/sources/streams/http/__init__.py +0 -0
  479. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  481. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  482. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  483. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  484. unit_tests/sources/streams/http/test_http.py +0 -635
  485. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  486. unit_tests/sources/streams/test_call_rate.py +0 -300
  487. unit_tests/sources/streams/test_stream_read.py +0 -405
  488. unit_tests/sources/streams/test_streams_core.py +0 -184
  489. unit_tests/sources/test_abstract_source.py +0 -1442
  490. unit_tests/sources/test_concurrent_source.py +0 -112
  491. unit_tests/sources/test_config.py +0 -92
  492. unit_tests/sources/test_connector_state_manager.py +0 -482
  493. unit_tests/sources/test_http_logger.py +0 -252
  494. unit_tests/sources/test_integration_source.py +0 -86
  495. unit_tests/sources/test_source.py +0 -684
  496. unit_tests/sources/test_source_read.py +0 -460
  497. unit_tests/test/__init__.py +0 -0
  498. unit_tests/test/mock_http/__init__.py +0 -0
  499. unit_tests/test/mock_http/test_matcher.py +0 -53
  500. unit_tests/test/mock_http/test_mocker.py +0 -214
  501. unit_tests/test/mock_http/test_request.py +0 -117
  502. unit_tests/test/mock_http/test_response_builder.py +0 -177
  503. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  504. unit_tests/utils/__init__.py +0 -0
  505. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  506. unit_tests/utils/test_mapping_helpers.py +0 -54
  507. unit_tests/utils/test_message_utils.py +0 -91
  508. unit_tests/utils/test_rate_limiting.py +0 -26
  509. unit_tests/utils/test_schema_inferrer.py +0 -202
  510. unit_tests/utils/test_secret_utils.py +0 -135
  511. unit_tests/utils/test_stream_status_utils.py +0 -61
  512. unit_tests/utils/test_traced_exception.py +0 -107
  513. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  514. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  515. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  516. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  517. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/LICENSE.txt +0 -0
@@ -1,3105 +0,0 @@
1
- #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
- #
4
-
5
- from airbyte_cdk.models import AirbyteAnalyticsTraceMessage
6
- from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
7
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
8
- from airbyte_cdk.test.catalog_builder import CatalogBuilder
9
- from airbyte_cdk.utils.traced_exception import AirbyteTracedException
10
- from airbyte_protocol.models import SyncMode
11
- from unit_tests.sources.file_based.helpers import EmptySchemaParser, LowInferenceLimitDiscoveryPolicy
12
- from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesSource
13
- from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
14
- from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenario, TestScenarioBuilder
15
-
16
- single_csv_scenario: TestScenario[InMemoryFilesSource] = (
17
- TestScenarioBuilder[InMemoryFilesSource]()
18
- .set_name("single_csv_scenario")
19
- .set_config(
20
- {
21
- "streams": [
22
- {
23
- "name": "stream1",
24
- "format": {"filetype": "csv"},
25
- "globs": ["*"],
26
- "validation_policy": "Emit Record",
27
- }
28
- ],
29
- "start_date": "2023-06-04T03:54:07.000000Z",
30
- }
31
- )
32
- .set_source_builder(
33
- FileBasedSourceBuilder()
34
- .set_files(
35
- {
36
- "a.csv": {
37
- "contents": [
38
- ("col1", "col2"),
39
- ("val11", "val12"),
40
- ("val21", "val22"),
41
- ],
42
- "last_modified": "2023-06-05T03:54:07.000Z",
43
- }
44
- }
45
- )
46
- .set_file_type("csv")
47
- )
48
- .set_expected_spec(
49
- {
50
- "documentationUrl": "https://docs.airbyte.com/integrations/sources/in_memory_files",
51
- "connectionSpecification": {
52
- "title": "InMemorySpec",
53
- "description": "Used during spec; allows the developer to configure the cloud provider specific options\nthat are needed when users configure a file-based source.",
54
- "type": "object",
55
- "properties": {
56
- "start_date": {
57
- "title": "Start Date",
58
- "description": "UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.",
59
- "examples": ["2021-01-01T00:00:00.000000Z"],
60
- "format": "date-time",
61
- "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$",
62
- "pattern_descriptor": "YYYY-MM-DDTHH:mm:ss.SSSSSSZ",
63
- "order": 1,
64
- "type": "string",
65
- },
66
- "streams": {
67
- "title": "The list of streams to sync",
68
- "description": 'Each instance of this configuration defines a <a href="https://docs.airbyte.com/cloud/core-concepts#stream">stream</a>. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.',
69
- "order": 10,
70
- "type": "array",
71
- "items": {
72
- "title": "FileBasedStreamConfig",
73
- "type": "object",
74
- "properties": {
75
- "name": {"title": "Name", "description": "The name of the stream.", "type": "string"},
76
- "globs": {
77
- "title": "Globs",
78
- "description": 'The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look <a href="https://en.wikipedia.org/wiki/Glob_(programming)">here</a>.',
79
- "type": "array",
80
- "items": {"type": "string"},
81
- "order": 1,
82
- "default": ["**"],
83
- },
84
- "legacy_prefix": {
85
- "title": "Legacy Prefix",
86
- "airbyte_hidden": True,
87
- "type": "string",
88
- "description": "The path prefix configured in v3 versions of the S3 connector. This option is deprecated in favor of a single glob.",
89
- },
90
- "validation_policy": {
91
- "title": "Validation Policy",
92
- "description": "The name of the validation policy that dictates sync behavior when a record does not adhere to the stream schema.",
93
- "default": "Emit Record",
94
- "enum": ["Emit Record", "Skip Record", "Wait for Discover"],
95
- },
96
- "input_schema": {
97
- "title": "Input Schema",
98
- "description": "The schema that will be used to validate records extracted from the file. This will override the stream schema that is auto-detected from incoming files.",
99
- "type": "string",
100
- },
101
- "primary_key": {
102
- "title": "Primary Key",
103
- "description": "The column or columns (for a composite key) that serves as the unique identifier of a record. If empty, the primary key will default to the parser's default primary key.",
104
- "type": "string",
105
- "airbyte_hidden": True,
106
- },
107
- "days_to_sync_if_history_is_full": {
108
- "title": "Days To Sync If History Is Full",
109
- "description": "When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.",
110
- "default": 3,
111
- "type": "integer",
112
- },
113
- "format": {
114
- "title": "Format",
115
- "description": "The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
116
- "type": "object",
117
- "oneOf": [
118
- {
119
- "title": "Avro Format",
120
- "type": "object",
121
- "properties": {
122
- "filetype": {"title": "Filetype", "default": "avro", "const": "avro", "type": "string"},
123
- "double_as_string": {
124
- "title": "Convert Double Fields to Strings",
125
- "description": "Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.",
126
- "default": False,
127
- "type": "boolean",
128
- },
129
- },
130
- "required": ["filetype"],
131
- },
132
- {
133
- "title": "CSV Format",
134
- "type": "object",
135
- "properties": {
136
- "filetype": {"title": "Filetype", "default": "csv", "const": "csv", "type": "string"},
137
- "delimiter": {
138
- "title": "Delimiter",
139
- "description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
140
- "default": ",",
141
- "type": "string",
142
- },
143
- "quote_char": {
144
- "title": "Quote Character",
145
- "description": "The character used for quoting CSV values. To disallow quoting, make this field blank.",
146
- "default": '"',
147
- "type": "string",
148
- },
149
- "escape_char": {
150
- "title": "Escape Character",
151
- "description": "The character used for escaping special characters. To disallow escaping, leave this field blank.",
152
- "type": "string",
153
- },
154
- "encoding": {
155
- "title": "Encoding",
156
- "description": 'The character encoding of the CSV data. Leave blank to default to <strong>UTF8</strong>. See <a href="https://docs.python.org/3/library/codecs.html#standard-encodings" target="_blank">list of python encodings</a> for allowable options.',
157
- "default": "utf8",
158
- "type": "string",
159
- },
160
- "double_quote": {
161
- "title": "Double Quote",
162
- "description": "Whether two quotes in a quoted CSV value denote a single quote in the data.",
163
- "default": True,
164
- "type": "boolean",
165
- },
166
- "null_values": {
167
- "title": "Null Values",
168
- "description": "A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.",
169
- "default": [],
170
- "type": "array",
171
- "items": {"type": "string"},
172
- "uniqueItems": True,
173
- },
174
- "strings_can_be_null": {
175
- "title": "Strings Can Be Null",
176
- "description": "Whether strings can be interpreted as null values. If true, strings that match the null_values set will be interpreted as null. If false, strings that match the null_values set will be interpreted as the string itself.",
177
- "default": True,
178
- "type": "boolean",
179
- },
180
- "skip_rows_before_header": {
181
- "title": "Skip Rows Before Header",
182
- "description": "The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
183
- "default": 0,
184
- "type": "integer",
185
- },
186
- "skip_rows_after_header": {
187
- "title": "Skip Rows After Header",
188
- "description": "The number of rows to skip after the header row.",
189
- "default": 0,
190
- "type": "integer",
191
- },
192
- "header_definition": {
193
- "title": "CSV Header Definition",
194
- "type": "object",
195
- "description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
196
- "default": {"header_definition_type": "From CSV"},
197
- "oneOf": [
198
- {
199
- "title": "From CSV",
200
- "type": "object",
201
- "properties": {
202
- "header_definition_type": {
203
- "title": "Header Definition Type",
204
- "default": "From CSV",
205
- "const": "From CSV",
206
- "type": "string",
207
- },
208
- },
209
- "required": ["header_definition_type"],
210
- },
211
- {
212
- "title": "Autogenerated",
213
- "type": "object",
214
- "properties": {
215
- "header_definition_type": {
216
- "title": "Header Definition Type",
217
- "default": "Autogenerated",
218
- "const": "Autogenerated",
219
- "type": "string",
220
- },
221
- },
222
- "required": ["header_definition_type"],
223
- },
224
- {
225
- "title": "User Provided",
226
- "type": "object",
227
- "properties": {
228
- "header_definition_type": {
229
- "title": "Header Definition Type",
230
- "default": "User Provided",
231
- "const": "User Provided",
232
- "type": "string",
233
- },
234
- "column_names": {
235
- "title": "Column Names",
236
- "description": "The column names that will be used while emitting the CSV records",
237
- "type": "array",
238
- "items": {"type": "string"},
239
- },
240
- },
241
- "required": ["column_names", "header_definition_type"],
242
- },
243
- ],
244
- },
245
- "true_values": {
246
- "title": "True Values",
247
- "description": "A set of case-sensitive strings that should be interpreted as true values.",
248
- "default": ["y", "yes", "t", "true", "on", "1"],
249
- "type": "array",
250
- "items": {"type": "string"},
251
- "uniqueItems": True,
252
- },
253
- "false_values": {
254
- "title": "False Values",
255
- "description": "A set of case-sensitive strings that should be interpreted as false values.",
256
- "default": ["n", "no", "f", "false", "off", "0"],
257
- "type": "array",
258
- "items": {"type": "string"},
259
- "uniqueItems": True,
260
- },
261
- "inference_type": {
262
- "title": "Inference Type",
263
- "description": "How to infer the types of the columns. If none, inference default to strings.",
264
- "default": "None",
265
- "airbyte_hidden": True,
266
- "enum": ["None", "Primitive Types Only"],
267
- },
268
- },
269
- "required": ["filetype"],
270
- },
271
- {
272
- "title": "Jsonl Format",
273
- "type": "object",
274
- "properties": {
275
- "filetype": {"title": "Filetype", "default": "jsonl", "const": "jsonl", "type": "string"}
276
- },
277
- "required": ["filetype"],
278
- },
279
- {
280
- "title": "Parquet Format",
281
- "type": "object",
282
- "properties": {
283
- "filetype": {
284
- "title": "Filetype",
285
- "default": "parquet",
286
- "const": "parquet",
287
- "type": "string",
288
- },
289
- "decimal_as_float": {
290
- "title": "Convert Decimal Fields to Floats",
291
- "description": "Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.",
292
- "default": False,
293
- "type": "boolean",
294
- },
295
- },
296
- "required": ["filetype"],
297
- },
298
- {
299
- "title": "Document File Type Format (Experimental)",
300
- "type": "object",
301
- "properties": {
302
- "filetype": {
303
- "title": "Filetype",
304
- "default": "unstructured",
305
- "const": "unstructured",
306
- "type": "string",
307
- },
308
- "skip_unprocessable_files": {
309
- "type": "boolean",
310
- "default": True,
311
- "title": "Skip Unprocessable Files",
312
- "description": "If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.",
313
- "always_show": True,
314
- },
315
- "strategy": {
316
- "type": "string",
317
- "always_show": True,
318
- "order": 0,
319
- "default": "auto",
320
- "title": "Parsing Strategy",
321
- "enum": ["auto", "fast", "ocr_only", "hi_res"],
322
- "description": "The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
323
- },
324
- "processing": {
325
- "title": "Processing",
326
- "description": "Processing configuration",
327
- "default": {"mode": "local"},
328
- "type": "object",
329
- "oneOf": [
330
- {
331
- "title": "Local",
332
- "type": "object",
333
- "properties": {
334
- "mode": {
335
- "title": "Mode",
336
- "default": "local",
337
- "const": "local",
338
- "enum": ["local"],
339
- "type": "string",
340
- }
341
- },
342
- "description": "Process files locally, supporting `fast` and `ocr` modes. This is the default option.",
343
- "required": ["mode"],
344
- },
345
- {
346
- "title": "via API",
347
- "type": "object",
348
- "properties": {
349
- "mode": {
350
- "title": "Mode",
351
- "default": "api",
352
- "const": "api",
353
- "enum": ["api"],
354
- "type": "string",
355
- },
356
- "api_key": {
357
- "title": "API Key",
358
- "description": "The API key to use matching the environment",
359
- "default": "",
360
- "always_show": True,
361
- "airbyte_secret": True,
362
- "type": "string",
363
- },
364
- "api_url": {
365
- "title": "API URL",
366
- "description": "The URL of the unstructured API to use",
367
- "default": "https://api.unstructured.io",
368
- "always_show": True,
369
- "examples": ["https://api.unstructured.com"],
370
- "type": "string",
371
- },
372
- "parameters": {
373
- "title": "Additional URL Parameters",
374
- "description": "List of parameters send to the API",
375
- "default": [],
376
- "always_show": True,
377
- "type": "array",
378
- "items": {
379
- "title": "APIParameterConfigModel",
380
- "type": "object",
381
- "properties": {
382
- "name": {
383
- "title": "Parameter name",
384
- "description": "The name of the unstructured API parameter to use",
385
- "examples": ["combine_under_n_chars", "languages"],
386
- "type": "string",
387
- },
388
- "value": {
389
- "title": "Value",
390
- "description": "The value of the parameter",
391
- "examples": ["true", "hi_res"],
392
- "type": "string",
393
- },
394
- },
395
- "required": ["name", "value"],
396
- },
397
- },
398
- },
399
- "description": "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured.",
400
- "required": ["mode"],
401
- },
402
- ],
403
- },
404
- },
405
- "description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.",
406
- "required": ["filetype"],
407
- },
408
- ],
409
- },
410
- "schemaless": {
411
- "title": "Schemaless",
412
- "description": "When enabled, syncs will not validate or structure records against the stream's schema.",
413
- "default": False,
414
- "type": "boolean",
415
- },
416
- },
417
- "required": ["name", "format"],
418
- },
419
- },
420
- },
421
- "required": ["streams"],
422
- },
423
- }
424
- )
425
- .set_expected_catalog(
426
- {
427
- "streams": [
428
- {
429
- "default_cursor_field": ["_ab_source_file_last_modified"],
430
- "json_schema": {
431
- "type": "object",
432
- "properties": {
433
- "col1": {"type": ["null", "string"]},
434
- "col2": {"type": ["null", "string"]},
435
- "_ab_source_file_last_modified": {"type": "string"},
436
- "_ab_source_file_url": {"type": "string"},
437
- },
438
- },
439
- "name": "stream1",
440
- "source_defined_cursor": True,
441
- "supported_sync_modes": ["full_refresh", "incremental"],
442
- }
443
- ]
444
- }
445
- )
446
- .set_expected_records(
447
- [
448
- {
449
- "data": {
450
- "col1": "val11",
451
- "col2": "val12",
452
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
453
- "_ab_source_file_url": "a.csv",
454
- },
455
- "stream": "stream1",
456
- },
457
- {
458
- "data": {
459
- "col1": "val21",
460
- "col2": "val22",
461
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
462
- "_ab_source_file_url": "a.csv",
463
- },
464
- "stream": "stream1",
465
- },
466
- ]
467
- )
468
- ).build()
469
-
470
- csv_analytics_scenario: TestScenario[InMemoryFilesSource] = (
471
- TestScenarioBuilder[InMemoryFilesSource]()
472
- .set_name("csv_analytics")
473
- .set_config(
474
- {
475
- "streams": [
476
- {
477
- "name": "stream1",
478
- "format": {"filetype": "csv"},
479
- "globs": ["a.csv"],
480
- "validation_policy": "Emit Record",
481
- },
482
- {
483
- "name": "stream2",
484
- "format": {"filetype": "csv"},
485
- "globs": ["b.csv"],
486
- "validation_policy": "Emit Record",
487
- }
488
- ]
489
- }
490
- )
491
- .set_source_builder(
492
- FileBasedSourceBuilder()
493
- .set_files(
494
- {
495
- "a.csv": {
496
- "contents": [
497
- ("col1", "col2"),
498
- ("val11a", "val12a"),
499
- ("val21a", "val22a"),
500
- ],
501
- "last_modified": "2023-06-05T03:54:07.000Z",
502
- },
503
- "b.csv": {
504
- "contents": [
505
- ("col1", "col2", "col3"),
506
- ("val11b", "val12b", "val13b"),
507
- ("val21b", "val22b", "val23b"),
508
- ],
509
- "last_modified": "2023-06-05T03:54:07.000Z",
510
- },
511
- }
512
- )
513
- .set_file_type("csv")
514
- )
515
- .set_expected_catalog(
516
- {
517
- "streams": [
518
- {
519
- "default_cursor_field": ["_ab_source_file_last_modified"],
520
- "json_schema": {
521
- "type": "object",
522
- "properties": {
523
- "col1": {"type": ["null", "string"]},
524
- "col2": {"type": ["null", "string"]},
525
- "_ab_source_file_last_modified": {"type": "string"},
526
- "_ab_source_file_url": {"type": "string"},
527
- },
528
- },
529
- "name": "stream1",
530
- "source_defined_cursor": True,
531
- "supported_sync_modes": ["full_refresh", "incremental"],
532
- },
533
- {
534
- "default_cursor_field": ["_ab_source_file_last_modified"],
535
- "json_schema": {
536
- "type": "object",
537
- "properties": {
538
- "col1": {"type": ["null", "string"]},
539
- "col2": {"type": ["null", "string"]},
540
- "col3": {"type": ["null", "string"]},
541
- "_ab_source_file_last_modified": {"type": "string"},
542
- "_ab_source_file_url": {"type": "string"},
543
- },
544
- },
545
- "name": "stream2",
546
- "source_defined_cursor": True,
547
- "supported_sync_modes": ["full_refresh", "incremental"],
548
- }
549
- ]
550
- }
551
- )
552
- .set_expected_records([
553
- {
554
- "data": {
555
- "col1": "val11a",
556
- "col2": "val12a",
557
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
558
- "_ab_source_file_url": "a.csv",
559
- },
560
- "stream": "stream1",
561
- },
562
- {
563
- "data": {
564
- "col1": "val21a",
565
- "col2": "val22a",
566
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
567
- "_ab_source_file_url": "a.csv",
568
- },
569
- "stream": "stream1",
570
- },
571
- {
572
- "data": {
573
- "col1": "val11b",
574
- "col2": "val12b",
575
- "col3": "val13b",
576
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
577
- "_ab_source_file_url": "b.csv",
578
- },
579
- "stream": "stream2",
580
- },
581
- {
582
- "data": {
583
- "col1": "val21b",
584
- "col2": "val22b",
585
- "col3": "val23b",
586
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
587
- "_ab_source_file_url": "b.csv",
588
- },
589
- "stream": "stream2",
590
- },
591
- ])
592
- .set_expected_analytics(
593
- [
594
- AirbyteAnalyticsTraceMessage(type="file-cdk-csv-stream-count", value="2"),
595
- ]
596
- )
597
- ).build()
598
-
599
- multi_csv_scenario: TestScenario[InMemoryFilesSource] = (
600
- TestScenarioBuilder[InMemoryFilesSource]()
601
- .set_name("multi_csv_stream")
602
- .set_config(
603
- {
604
- "streams": [
605
- {
606
- "name": "stream1",
607
- "format": {"filetype": "csv"},
608
- "globs": ["*"],
609
- "validation_policy": "Emit Record",
610
- }
611
- ]
612
- }
613
- )
614
- .set_source_builder(
615
- FileBasedSourceBuilder()
616
- .set_files(
617
- {
618
- "a.csv": {
619
- "contents": [
620
- ("col1", "col2"),
621
- ("val11a", "val12a"),
622
- ("val21a", "val22a"),
623
- ],
624
- "last_modified": "2023-06-05T03:54:07.000Z",
625
- },
626
- "b.csv": {
627
- "contents": [
628
- ("col1", "col2", "col3"),
629
- ("val11b", "val12b", "val13b"),
630
- ("val21b", "val22b", "val23b"),
631
- ],
632
- "last_modified": "2023-06-05T03:54:07.000Z",
633
- },
634
- }
635
- )
636
- .set_file_type("csv")
637
- )
638
- .set_expected_catalog(
639
- {
640
- "streams": [
641
- {
642
- "default_cursor_field": ["_ab_source_file_last_modified"],
643
- "json_schema": {
644
- "type": "object",
645
- "properties": {
646
- "col1": {"type": ["null", "string"]},
647
- "col2": {"type": ["null", "string"]},
648
- "col3": {"type": ["null", "string"]},
649
- "_ab_source_file_last_modified": {"type": "string"},
650
- "_ab_source_file_url": {"type": "string"},
651
- },
652
- },
653
- "name": "stream1",
654
- "source_defined_cursor": True,
655
- "supported_sync_modes": ["full_refresh", "incremental"],
656
- }
657
- ]
658
- }
659
- )
660
- .set_expected_records(
661
- [
662
- {
663
- "data": {
664
- "col1": "val11a",
665
- "col2": "val12a",
666
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
667
- "_ab_source_file_url": "a.csv",
668
- },
669
- "stream": "stream1",
670
- },
671
- {
672
- "data": {
673
- "col1": "val21a",
674
- "col2": "val22a",
675
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
676
- "_ab_source_file_url": "a.csv",
677
- },
678
- "stream": "stream1",
679
- },
680
- {
681
- "data": {
682
- "col1": "val11b",
683
- "col2": "val12b",
684
- "col3": "val13b",
685
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
686
- "_ab_source_file_url": "b.csv",
687
- },
688
- "stream": "stream1",
689
- },
690
- {
691
- "data": {
692
- "col1": "val21b",
693
- "col2": "val22b",
694
- "col3": "val23b",
695
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
696
- "_ab_source_file_url": "b.csv",
697
- },
698
- "stream": "stream1",
699
- },
700
- ]
701
- )
702
- ).build()
703
-
704
- multi_csv_stream_n_file_exceeds_limit_for_inference = (
705
- TestScenarioBuilder[InMemoryFilesSource]()
706
- .set_name("multi_csv_stream_n_file_exceeds_limit_for_inference")
707
- .set_config(
708
- {
709
- "streams": [
710
- {
711
- "name": "stream1",
712
- "format": {"filetype": "csv"},
713
- "globs": ["*"],
714
- "validation_policy": "Emit Record",
715
- }
716
- ]
717
- }
718
- )
719
- .set_source_builder(
720
- FileBasedSourceBuilder()
721
- .set_files(
722
- {
723
- "a.csv": {
724
- "contents": [
725
- ("col1", "col2"),
726
- ("val11a", "val12a"),
727
- ("val21a", "val22a"),
728
- ],
729
- "last_modified": "2023-06-05T03:54:07.000Z",
730
- },
731
- "b.csv": {
732
- "contents": [
733
- ("col1", "col2", "col3"),
734
- ("val11b", "val12b", "val13b"),
735
- ("val21b", "val22b", "val23b"),
736
- ],
737
- "last_modified": "2023-06-05T03:54:07.000Z",
738
- },
739
- }
740
- )
741
- .set_file_type("csv")
742
- .set_discovery_policy(LowInferenceLimitDiscoveryPolicy())
743
- )
744
- .set_expected_catalog(
745
- {
746
- "streams": [
747
- {
748
- "default_cursor_field": ["_ab_source_file_last_modified"],
749
- "json_schema": {
750
- "type": "object",
751
- "properties": {
752
- "col1": {"type": ["null", "string"]},
753
- "col2": {"type": ["null", "string"]},
754
- "_ab_source_file_last_modified": {"type": "string"},
755
- "_ab_source_file_url": {"type": "string"},
756
- },
757
- },
758
- "name": "stream1",
759
- "source_defined_cursor": True,
760
- "supported_sync_modes": ["full_refresh", "incremental"],
761
- }
762
- ]
763
- }
764
- )
765
- .set_expected_records(
766
- [
767
- {
768
- "data": {
769
- "col1": "val11a",
770
- "col2": "val12a",
771
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
772
- "_ab_source_file_url": "a.csv",
773
- },
774
- "stream": "stream1",
775
- },
776
- {
777
- "data": {
778
- "col1": "val21a",
779
- "col2": "val22a",
780
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
781
- "_ab_source_file_url": "a.csv",
782
- },
783
- "stream": "stream1",
784
- },
785
- {
786
- "data": {
787
- "col1": "val11b",
788
- "col2": "val12b",
789
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
790
- "_ab_source_file_url": "b.csv",
791
- },
792
- "stream": "stream1",
793
- },
794
- {
795
- "data": {
796
- "col1": "val21b",
797
- "col2": "val22b",
798
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
799
- "_ab_source_file_url": "b.csv",
800
- },
801
- "stream": "stream1",
802
- },
803
- ]
804
- )
805
- ).build()
806
-
807
- invalid_csv_scenario: TestScenario[InMemoryFilesSource] = (
808
- TestScenarioBuilder[InMemoryFilesSource]()
809
- .set_name("invalid_csv_scenario") # too many values for the number of headers
810
- .set_config(
811
- {
812
- "streams": [
813
- {
814
- "name": "stream1",
815
- "format": {"filetype": "csv"},
816
- "globs": ["*"],
817
- "validation_policy": "Emit Record",
818
- }
819
- ]
820
- }
821
- )
822
- .set_source_builder(
823
- FileBasedSourceBuilder()
824
- .set_files(
825
- {
826
- "a.csv": {
827
- "contents": [
828
- ("col1",),
829
- ("val11", "val12"),
830
- ("val21", "val22"),
831
- ],
832
- "last_modified": "2023-06-05T03:54:07.000Z",
833
- }
834
- }
835
- )
836
- .set_file_type("csv")
837
- )
838
- .set_expected_catalog(
839
- {
840
- "streams": [
841
- {
842
- "default_cursor_field": ["_ab_source_file_last_modified"],
843
- "json_schema": {
844
- "type": "object",
845
- "properties": {
846
- "col1": {"type": ["null", "string"]},
847
- "col2": {"type": ["null", "string"]},
848
- "_ab_source_file_last_modified": {"type": "string"},
849
- "_ab_source_file_url": {"type": "string"},
850
- },
851
- },
852
- "name": "stream1",
853
- "source_defined_cursor": True,
854
- "supported_sync_modes": ["full_refresh", "incremental"],
855
- }
856
- ]
857
- }
858
- )
859
- .set_expected_records([])
860
- .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
861
- .set_expected_logs(
862
- {
863
- "read": [
864
- {
865
- "level": "ERROR",
866
- "message": f"{FileBasedSourceError.INVALID_SCHEMA_ERROR.value} stream=stream1 file=a.csv line_no=1 n_skipped=0",
867
- },
868
- ]
869
- }
870
- )
871
- .set_expected_read_error(
872
- AirbyteTracedException,
873
- "Please check the logged errors for more information.",
874
- )
875
- ).build()
876
-
877
- invalid_csv_multi_scenario: TestScenario[InMemoryFilesSource] = (
878
- TestScenarioBuilder[InMemoryFilesSource]()
879
- .set_name("invalid_csv_multi_scenario") # too many values for the number of headers
880
- .set_config(
881
- {
882
- "streams": [
883
- {
884
- "name": "stream1",
885
- "format": {"filetype": "csv"},
886
- "globs": ["*"],
887
- "validation_policy": "Emit Record",
888
- },
889
- {
890
- "name": "stream2",
891
- "format": {"filetype": "csv"},
892
- "globs": ["b.csv"],
893
- "validation_policy": "Emit Record",
894
- },
895
- ]
896
- }
897
- )
898
- .set_source_builder(
899
- FileBasedSourceBuilder()
900
- .set_files(
901
- {
902
- "a.csv": {
903
- "contents": [
904
- ("col1",),
905
- ("val11", "val12"),
906
- ("val21", "val22"),
907
- ],
908
- "last_modified": "2023-06-05T03:54:07.000Z",
909
- },
910
- "b.csv": {
911
- "contents": [
912
- ("col3",),
913
- ("val13b", "val14b"),
914
- ("val23b", "val24b"),
915
- ],
916
- "last_modified": "2023-06-05T03:54:07.000Z",
917
- },
918
- }
919
- )
920
- .set_file_type("csv")
921
- )
922
- .set_expected_catalog(
923
- {
924
- "streams": [
925
- {
926
- "default_cursor_field": ["_ab_source_file_last_modified"],
927
- "json_schema": {
928
- "type": "object",
929
- "properties": {
930
- "col1": {"type": ["null", "string"]},
931
- "col2": {"type": ["null", "string"]},
932
- "_ab_source_file_last_modified": {"type": "string"},
933
- "_ab_source_file_url": {"type": "string"},
934
- },
935
- },
936
- "name": "stream1",
937
- "source_defined_cursor": True,
938
- "supported_sync_modes": ["full_refresh", "incremental"],
939
- },
940
- {
941
- "json_schema": {
942
- "type": "object",
943
- "properties": {
944
- "col3": {"type": ["null", "string"]},
945
- "_ab_source_file_last_modified": {"type": "string"},
946
- "_ab_source_file_url": {"type": "string"},
947
- },
948
- },
949
- "name": "stream2",
950
- "source_defined_cursor": True,
951
- "default_cursor_field": ["_ab_source_file_last_modified"],
952
- "supported_sync_modes": ["full_refresh", "incremental"],
953
- },
954
- ]
955
- }
956
- )
957
- .set_expected_records([])
958
- .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
959
- .set_expected_logs(
960
- {
961
- "read": [
962
- {
963
- "level": "ERROR",
964
- "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=1 n_skipped=0",
965
- },
966
- {
967
- "level": "ERROR",
968
- "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream2 file=b.csv line_no=1 n_skipped=0",
969
- },
970
- ]
971
- }
972
- )
973
- .set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.")
974
- ).build()
975
-
976
- csv_single_stream_scenario: TestScenario[InMemoryFilesSource] = (
977
- TestScenarioBuilder[InMemoryFilesSource]()
978
- .set_name("csv_single_stream_scenario")
979
- .set_config(
980
- {
981
- "streams": [
982
- {
983
- "name": "stream1",
984
- "format": {"filetype": "csv"},
985
- "globs": ["*.csv"],
986
- "validation_policy": "Emit Record",
987
- }
988
- ]
989
- }
990
- )
991
- .set_source_builder(
992
- FileBasedSourceBuilder()
993
- .set_files(
994
- {
995
- "a.csv": {
996
- "contents": [
997
- ("col1", "col2"),
998
- ("val11a", "val12a"),
999
- ("val21a", "val22a"),
1000
- ],
1001
- "last_modified": "2023-06-05T03:54:07.000Z",
1002
- },
1003
- "b.jsonl": {
1004
- "contents": [
1005
- {"col1": "val11b", "col2": "val12b", "col3": "val13b"},
1006
- {"col1": "val12b", "col2": "val22b", "col3": "val23b"},
1007
- ],
1008
- "last_modified": "2023-06-05T03:54:07.000Z",
1009
- },
1010
- }
1011
- )
1012
- .set_file_type("csv")
1013
- )
1014
- .set_expected_catalog(
1015
- {
1016
- "streams": [
1017
- {
1018
- "json_schema": {
1019
- "type": "object",
1020
- "properties": {
1021
- "col1": {"type": ["null", "string"]},
1022
- "col2": {"type": ["null", "string"]},
1023
- "_ab_source_file_last_modified": {"type": "string"},
1024
- "_ab_source_file_url": {"type": "string"},
1025
- },
1026
- },
1027
- "name": "stream1",
1028
- "supported_sync_modes": ["full_refresh", "incremental"],
1029
- "source_defined_cursor": True,
1030
- "default_cursor_field": ["_ab_source_file_last_modified"],
1031
- }
1032
- ]
1033
- }
1034
- )
1035
- .set_expected_records(
1036
- [
1037
- {
1038
- "data": {
1039
- "col1": "val11a",
1040
- "col2": "val12a",
1041
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1042
- "_ab_source_file_url": "a.csv",
1043
- },
1044
- "stream": "stream1",
1045
- },
1046
- {
1047
- "data": {
1048
- "col1": "val21a",
1049
- "col2": "val22a",
1050
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1051
- "_ab_source_file_url": "a.csv",
1052
- },
1053
- "stream": "stream1",
1054
- },
1055
- ]
1056
- )
1057
- ).build()
1058
-
1059
- csv_multi_stream_scenario: TestScenario[InMemoryFilesSource] = (
1060
- TestScenarioBuilder[InMemoryFilesSource]()
1061
- .set_name("csv_multi_stream")
1062
- .set_config(
1063
- {
1064
- "streams": [
1065
- {
1066
- "name": "stream1",
1067
- "format": {"filetype": "csv"},
1068
- "globs": ["*.csv"],
1069
- "validation_policy": "Emit Record",
1070
- },
1071
- {
1072
- "name": "stream2",
1073
- "format": {"filetype": "csv"},
1074
- "globs": ["b.csv"],
1075
- "validation_policy": "Emit Record",
1076
- },
1077
- ]
1078
- }
1079
- )
1080
- .set_source_builder(
1081
- FileBasedSourceBuilder()
1082
- .set_files(
1083
- {
1084
- "a.csv": {
1085
- "contents": [
1086
- ("col1", "col2"),
1087
- ("val11a", "val12a"),
1088
- ("val21a", "val22a"),
1089
- ],
1090
- "last_modified": "2023-06-05T03:54:07.000Z",
1091
- },
1092
- "b.csv": {
1093
- "contents": [
1094
- ("col3",),
1095
- ("val13b",),
1096
- ("val23b",),
1097
- ],
1098
- "last_modified": "2023-06-05T03:54:07.000Z",
1099
- },
1100
- }
1101
- )
1102
- .set_file_type("csv")
1103
- )
1104
- .set_expected_catalog(
1105
- {
1106
- "streams": [
1107
- {
1108
- "json_schema": {
1109
- "type": "object",
1110
- "properties": {
1111
- "col1": {"type": ["null", "string"]},
1112
- "col2": {"type": ["null", "string"]},
1113
- "col3": {"type": ["null", "string"]},
1114
- "_ab_source_file_last_modified": {"type": "string"},
1115
- "_ab_source_file_url": {"type": "string"},
1116
- },
1117
- },
1118
- "name": "stream1",
1119
- "supported_sync_modes": ["full_refresh", "incremental"],
1120
- "source_defined_cursor": True,
1121
- "default_cursor_field": ["_ab_source_file_last_modified"],
1122
- },
1123
- {
1124
- "json_schema": {
1125
- "type": "object",
1126
- "properties": {
1127
- "col3": {"type": ["null", "string"]},
1128
- "_ab_source_file_last_modified": {"type": "string"},
1129
- "_ab_source_file_url": {"type": "string"},
1130
- },
1131
- },
1132
- "name": "stream2",
1133
- "source_defined_cursor": True,
1134
- "default_cursor_field": ["_ab_source_file_last_modified"],
1135
- "supported_sync_modes": ["full_refresh", "incremental"],
1136
- },
1137
- ]
1138
- }
1139
- )
1140
- .set_expected_records(
1141
- [
1142
- {
1143
- "data": {
1144
- "col1": "val11a",
1145
- "col2": "val12a",
1146
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1147
- "_ab_source_file_url": "a.csv",
1148
- },
1149
- "stream": "stream1",
1150
- },
1151
- {
1152
- "data": {
1153
- "col1": "val21a",
1154
- "col2": "val22a",
1155
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1156
- "_ab_source_file_url": "a.csv",
1157
- },
1158
- "stream": "stream1",
1159
- },
1160
- {
1161
- "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"},
1162
- "stream": "stream1",
1163
- },
1164
- {
1165
- "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"},
1166
- "stream": "stream1",
1167
- },
1168
- {
1169
- "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"},
1170
- "stream": "stream2",
1171
- },
1172
- {
1173
- "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"},
1174
- "stream": "stream2",
1175
- },
1176
- ]
1177
- )
1178
- ).build()
1179
-
1180
- csv_custom_format_scenario: TestScenario[InMemoryFilesSource] = (
1181
- TestScenarioBuilder[InMemoryFilesSource]()
1182
- .set_name("csv_custom_format")
1183
- .set_config(
1184
- {
1185
- "streams": [
1186
- {
1187
- "name": "stream1",
1188
- "globs": ["*"],
1189
- "validation_policy": "Emit Record",
1190
- "format": {
1191
- "filetype": "csv",
1192
- "delimiter": "#",
1193
- "quote_char": "|",
1194
- "escape_char": "!",
1195
- "double_quote": True,
1196
- },
1197
- }
1198
- ]
1199
- }
1200
- )
1201
- .set_source_builder(
1202
- FileBasedSourceBuilder()
1203
- .set_files(
1204
- {
1205
- "a.csv": {
1206
- "contents": [
1207
- ("col1", "col2", "col3"),
1208
- ("val11", "val12", "val |13|"),
1209
- ("val21", "val22", "val23"),
1210
- ("val,31", "val |,32|", "val, !!!! 33"),
1211
- ],
1212
- "last_modified": "2023-06-05T03:54:07.000Z",
1213
- }
1214
- }
1215
- )
1216
- .set_file_type("csv")
1217
- .set_file_write_options(
1218
- {
1219
- "delimiter": "#",
1220
- "quotechar": "|",
1221
- }
1222
- )
1223
- )
1224
- .set_expected_catalog(
1225
- {
1226
- "streams": [
1227
- {
1228
- "json_schema": {
1229
- "type": "object",
1230
- "properties": {
1231
- "col1": {
1232
- "type": ["null", "string"],
1233
- },
1234
- "col2": {
1235
- "type": ["null", "string"],
1236
- },
1237
- "col3": {
1238
- "type": ["null", "string"],
1239
- },
1240
- "_ab_source_file_last_modified": {"type": "string"},
1241
- "_ab_source_file_url": {"type": "string"},
1242
- },
1243
- },
1244
- "name": "stream1",
1245
- "source_defined_cursor": True,
1246
- "default_cursor_field": ["_ab_source_file_last_modified"],
1247
- "supported_sync_modes": ["full_refresh", "incremental"],
1248
- }
1249
- ]
1250
- }
1251
- )
1252
- .set_expected_records(
1253
- [
1254
- {
1255
- "data": {
1256
- "col1": "val11",
1257
- "col2": "val12",
1258
- "col3": "val |13|",
1259
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1260
- "_ab_source_file_url": "a.csv",
1261
- },
1262
- "stream": "stream1",
1263
- },
1264
- {
1265
- "data": {
1266
- "col1": "val21",
1267
- "col2": "val22",
1268
- "col3": "val23",
1269
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1270
- "_ab_source_file_url": "a.csv",
1271
- },
1272
- "stream": "stream1",
1273
- },
1274
- {
1275
- "data": {
1276
- "col1": "val,31",
1277
- "col2": "val |,32|",
1278
- "col3": "val, !! 33",
1279
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1280
- "_ab_source_file_url": "a.csv",
1281
- },
1282
- "stream": "stream1",
1283
- },
1284
- ]
1285
- )
1286
- ).build()
1287
-
1288
- multi_stream_custom_format = (
1289
- TestScenarioBuilder[InMemoryFilesSource]()
1290
- .set_name("multi_stream_custom_format_scenario")
1291
- .set_config(
1292
- {
1293
- "streams": [
1294
- {
1295
- "name": "stream1",
1296
- "globs": ["*.csv"],
1297
- "validation_policy": "Emit Record",
1298
- "format": {"filetype": "csv", "delimiter": "#", "escape_char": "!", "double_quote": True, "newlines_in_values": False},
1299
- },
1300
- {
1301
- "name": "stream2",
1302
- "globs": ["b.csv"],
1303
- "validation_policy": "Emit Record",
1304
- "format": {
1305
- "filetype": "csv",
1306
- "delimiter": "#",
1307
- "escape_char": "@",
1308
- "double_quote": True,
1309
- "newlines_in_values": False,
1310
- },
1311
- },
1312
- ]
1313
- }
1314
- )
1315
- .set_source_builder(
1316
- FileBasedSourceBuilder()
1317
- .set_files(
1318
- {
1319
- "a.csv": {
1320
- "contents": [
1321
- ("col1", "col2"),
1322
- ("val11a", "val !! 12a"),
1323
- ("val !! 21a", "val22a"),
1324
- ],
1325
- "last_modified": "2023-06-05T03:54:07.000Z",
1326
- },
1327
- "b.csv": {
1328
- "contents": [
1329
- ("col3",),
1330
- ("val @@@@ 13b",),
1331
- ("val23b",),
1332
- ],
1333
- "last_modified": "2023-06-05T03:54:07.000Z",
1334
- },
1335
- }
1336
- )
1337
- .set_file_type("csv")
1338
- .set_file_write_options(
1339
- {
1340
- "delimiter": "#",
1341
- }
1342
- )
1343
- )
1344
- .set_expected_catalog(
1345
- {
1346
- "streams": [
1347
- {
1348
- "json_schema": {
1349
- "type": "object",
1350
- "properties": {
1351
- "col1": {
1352
- "type": ["null", "string"],
1353
- },
1354
- "col2": {
1355
- "type": ["null", "string"],
1356
- },
1357
- "col3": {
1358
- "type": ["null", "string"],
1359
- },
1360
- "_ab_source_file_last_modified": {"type": "string"},
1361
- "_ab_source_file_url": {"type": "string"},
1362
- },
1363
- },
1364
- "name": "stream1",
1365
- "supported_sync_modes": ["full_refresh", "incremental"],
1366
- "source_defined_cursor": True,
1367
- "default_cursor_field": ["_ab_source_file_last_modified"],
1368
- },
1369
- {
1370
- "json_schema": {
1371
- "type": "object",
1372
- "properties": {
1373
- "col3": {
1374
- "type": ["null", "string"],
1375
- },
1376
- "_ab_source_file_last_modified": {"type": "string"},
1377
- "_ab_source_file_url": {"type": "string"},
1378
- },
1379
- },
1380
- "name": "stream2",
1381
- "source_defined_cursor": True,
1382
- "default_cursor_field": ["_ab_source_file_last_modified"],
1383
- "supported_sync_modes": ["full_refresh", "incremental"],
1384
- },
1385
- ]
1386
- }
1387
- )
1388
- .set_expected_records(
1389
- [
1390
- {
1391
- "data": {
1392
- "col1": "val11a",
1393
- "col2": "val ! 12a",
1394
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1395
- "_ab_source_file_url": "a.csv",
1396
- },
1397
- "stream": "stream1",
1398
- },
1399
- {
1400
- "data": {
1401
- "col1": "val ! 21a",
1402
- "col2": "val22a",
1403
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1404
- "_ab_source_file_url": "a.csv",
1405
- },
1406
- "stream": "stream1",
1407
- },
1408
- {
1409
- "data": {
1410
- "col3": "val @@@@ 13b",
1411
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1412
- "_ab_source_file_url": "b.csv",
1413
- },
1414
- "stream": "stream1",
1415
- },
1416
- {
1417
- "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"},
1418
- "stream": "stream1",
1419
- },
1420
- {
1421
- "data": {
1422
- "col3": "val @@ 13b",
1423
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1424
- "_ab_source_file_url": "b.csv",
1425
- },
1426
- "stream": "stream2",
1427
- },
1428
- {
1429
- "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"},
1430
- "stream": "stream2",
1431
- },
1432
- ]
1433
- )
1434
- ).build()
1435
-
1436
- empty_schema_inference_scenario: TestScenario[InMemoryFilesSource] = (
1437
- TestScenarioBuilder[InMemoryFilesSource]()
1438
- .set_name("empty_schema_inference_scenario")
1439
- .set_config(
1440
- {
1441
- "streams": [
1442
- {
1443
- "name": "stream1",
1444
- "format": {"filetype": "csv"},
1445
- "globs": ["*"],
1446
- "validation_policy": "Emit Record",
1447
- }
1448
- ]
1449
- }
1450
- )
1451
- .set_source_builder(
1452
- FileBasedSourceBuilder()
1453
- .set_files(
1454
- {
1455
- "a.csv": {
1456
- "contents": [
1457
- ("col1", "col2"),
1458
- ("val11", "val12"),
1459
- ("val21", "val22"),
1460
- ],
1461
- "last_modified": "2023-06-05T03:54:07.000Z",
1462
- }
1463
- }
1464
- )
1465
- .set_file_type("csv")
1466
- .set_parsers({CsvFormat: EmptySchemaParser()})
1467
- )
1468
- .set_expected_catalog(
1469
- {
1470
- "streams": [
1471
- {
1472
- "default_cursor_field": ["_ab_source_file_last_modified"],
1473
- "json_schema": {
1474
- "type": "object",
1475
- "properties": {
1476
- "col1": {"type": ["null", "string"]},
1477
- "col2": {"type": ["null", "string"]},
1478
- "_ab_source_file_last_modified": {"type": "string"},
1479
- "_ab_source_file_url": {"type": "string"},
1480
- },
1481
- },
1482
- "name": "stream1",
1483
- "source_defined_cursor": True,
1484
- "supported_sync_modes": ["full_refresh", "incremental"],
1485
- }
1486
- ]
1487
- }
1488
- )
1489
- .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
1490
- ).build()
1491
-
1492
- schemaless_csv_scenario: TestScenario[InMemoryFilesSource] = (
1493
- TestScenarioBuilder[InMemoryFilesSource]()
1494
- .set_name("schemaless_csv_scenario")
1495
- .set_config(
1496
- {
1497
- "streams": [
1498
- {
1499
- "name": "stream1",
1500
- "format": {"filetype": "csv"},
1501
- "globs": ["*"],
1502
- "validation_policy": "Skip Record",
1503
- "schemaless": True,
1504
- }
1505
- ]
1506
- }
1507
- )
1508
- .set_source_builder(
1509
- FileBasedSourceBuilder()
1510
- .set_files(
1511
- {
1512
- "a.csv": {
1513
- "contents": [
1514
- ("col1", "col2"),
1515
- ("val11a", "val12a"),
1516
- ("val21a", "val22a"),
1517
- ],
1518
- "last_modified": "2023-06-05T03:54:07.000Z",
1519
- },
1520
- "b.csv": {
1521
- "contents": [
1522
- ("col1", "col2", "col3"),
1523
- ("val11b", "val12b", "val13b"),
1524
- ("val21b", "val22b", "val23b"),
1525
- ],
1526
- "last_modified": "2023-06-05T03:54:07.000Z",
1527
- },
1528
- }
1529
- )
1530
- .set_file_type("csv")
1531
- )
1532
- .set_expected_catalog(
1533
- {
1534
- "streams": [
1535
- {
1536
- "default_cursor_field": ["_ab_source_file_last_modified"],
1537
- "json_schema": {
1538
- "type": "object",
1539
- "properties": {
1540
- "data": {"type": "object"},
1541
- "_ab_source_file_last_modified": {"type": "string"},
1542
- "_ab_source_file_url": {"type": "string"},
1543
- },
1544
- },
1545
- "name": "stream1",
1546
- "source_defined_cursor": True,
1547
- "supported_sync_modes": ["full_refresh", "incremental"],
1548
- }
1549
- ]
1550
- }
1551
- )
1552
- .set_expected_records(
1553
- [
1554
- {
1555
- "data": {
1556
- "data": {"col1": "val11a", "col2": "val12a"},
1557
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1558
- "_ab_source_file_url": "a.csv",
1559
- },
1560
- "stream": "stream1",
1561
- },
1562
- {
1563
- "data": {
1564
- "data": {"col1": "val21a", "col2": "val22a"},
1565
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1566
- "_ab_source_file_url": "a.csv",
1567
- },
1568
- "stream": "stream1",
1569
- },
1570
- {
1571
- "data": {
1572
- "data": {"col1": "val11b", "col2": "val12b", "col3": "val13b"},
1573
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1574
- "_ab_source_file_url": "b.csv",
1575
- },
1576
- "stream": "stream1",
1577
- },
1578
- {
1579
- "data": {
1580
- "data": {"col1": "val21b", "col2": "val22b", "col3": "val23b"},
1581
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1582
- "_ab_source_file_url": "b.csv",
1583
- },
1584
- "stream": "stream1",
1585
- },
1586
- ]
1587
- )
1588
- ).build()
1589
-
1590
- schemaless_csv_multi_stream_scenario: TestScenario[InMemoryFilesSource] = (
1591
- TestScenarioBuilder[InMemoryFilesSource]()
1592
- .set_name("schemaless_csv_multi_stream_scenario")
1593
- .set_config(
1594
- {
1595
- "streams": [
1596
- {
1597
- "name": "stream1",
1598
- "format": {"filetype": "csv"},
1599
- "globs": ["a.csv"],
1600
- "validation_policy": "Skip Record",
1601
- "schemaless": True,
1602
- },
1603
- {
1604
- "name": "stream2",
1605
- "format": {"filetype": "csv"},
1606
- "globs": ["b.csv"],
1607
- "validation_policy": "Skip Record",
1608
- },
1609
- ]
1610
- }
1611
- )
1612
- .set_source_builder(
1613
- FileBasedSourceBuilder()
1614
- .set_files(
1615
- {
1616
- "a.csv": {
1617
- "contents": [
1618
- ("col1", "col2"),
1619
- ("val11a", "val12a"),
1620
- ("val21a", "val22a"),
1621
- ],
1622
- "last_modified": "2023-06-05T03:54:07.000Z",
1623
- },
1624
- "b.csv": {
1625
- "contents": [
1626
- ("col3",),
1627
- ("val13b",),
1628
- ("val23b",),
1629
- ],
1630
- "last_modified": "2023-06-05T03:54:07.000Z",
1631
- },
1632
- }
1633
- )
1634
- .set_file_type("csv")
1635
- )
1636
- .set_expected_catalog(
1637
- {
1638
- "streams": [
1639
- {
1640
- "json_schema": {
1641
- "type": "object",
1642
- "properties": {
1643
- "data": {"type": "object"},
1644
- "_ab_source_file_last_modified": {"type": "string"},
1645
- "_ab_source_file_url": {"type": "string"},
1646
- },
1647
- },
1648
- "name": "stream1",
1649
- "supported_sync_modes": ["full_refresh", "incremental"],
1650
- "source_defined_cursor": True,
1651
- "default_cursor_field": ["_ab_source_file_last_modified"],
1652
- },
1653
- {
1654
- "json_schema": {
1655
- "type": "object",
1656
- "properties": {
1657
- "col3": {"type": ["null", "string"]},
1658
- "_ab_source_file_last_modified": {"type": "string"},
1659
- "_ab_source_file_url": {"type": "string"},
1660
- },
1661
- },
1662
- "name": "stream2",
1663
- "source_defined_cursor": True,
1664
- "default_cursor_field": ["_ab_source_file_last_modified"],
1665
- "supported_sync_modes": ["full_refresh", "incremental"],
1666
- },
1667
- ]
1668
- }
1669
- )
1670
- .set_expected_records(
1671
- [
1672
- {
1673
- "data": {
1674
- "data": {"col1": "val11a", "col2": "val12a"},
1675
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1676
- "_ab_source_file_url": "a.csv",
1677
- },
1678
- "stream": "stream1",
1679
- },
1680
- {
1681
- "data": {
1682
- "data": {"col1": "val21a", "col2": "val22a"},
1683
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1684
- "_ab_source_file_url": "a.csv",
1685
- },
1686
- "stream": "stream1",
1687
- },
1688
- {
1689
- "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"},
1690
- "stream": "stream2",
1691
- },
1692
- {
1693
- "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"},
1694
- "stream": "stream2",
1695
- },
1696
- ]
1697
- )
1698
- ).build()
1699
-
1700
- schemaless_with_user_input_schema_fails_connection_check_scenario: TestScenario[InMemoryFilesSource] = (
1701
- TestScenarioBuilder[InMemoryFilesSource]()
1702
- .set_name("schemaless_with_user_input_schema_fails_connection_check_scenario")
1703
- .set_config(
1704
- {
1705
- "streams": [
1706
- {
1707
- "name": "stream1",
1708
- "format": {"filetype": "csv"},
1709
- "globs": ["*"],
1710
- "validation_policy": "Skip Record",
1711
- "input_schema": '{"col1": "string", "col2": "string", "col3": "string"}',
1712
- "schemaless": True,
1713
- }
1714
- ]
1715
- }
1716
- )
1717
- .set_source_builder(
1718
- FileBasedSourceBuilder()
1719
- .set_files(
1720
- {
1721
- "a.csv": {
1722
- "contents": [
1723
- ("col1", "col2"),
1724
- ("val11a", "val12a"),
1725
- ("val21a", "val22a"),
1726
- ],
1727
- "last_modified": "2023-06-05T03:54:07.000Z",
1728
- },
1729
- "b.csv": {
1730
- "contents": [
1731
- ("col1", "col2", "col3"),
1732
- ("val11b", "val12b", "val13b"),
1733
- ("val21b", "val22b", "val23b"),
1734
- ],
1735
- "last_modified": "2023-06-05T03:54:07.000Z",
1736
- },
1737
- }
1738
- )
1739
- .set_file_type("csv")
1740
- )
1741
- .set_catalog(CatalogBuilder().with_stream("stream1", SyncMode.full_refresh).build())
1742
- .set_expected_catalog(
1743
- {
1744
- "streams": [
1745
- {
1746
- "default_cursor_field": ["_ab_source_file_last_modified"],
1747
- "json_schema": {
1748
- "type": "object",
1749
- "properties": {
1750
- "data": {"type": "object"},
1751
- "_ab_source_file_last_modified": {"type": "string"},
1752
- "_ab_source_file_url": {"type": "string"},
1753
- },
1754
- },
1755
- "name": "stream1",
1756
- "source_defined_cursor": True,
1757
- "supported_sync_modes": ["full_refresh", "incremental"],
1758
- }
1759
- ]
1760
- }
1761
- )
1762
- .set_expected_check_status("FAILED")
1763
- .set_expected_check_error(AirbyteTracedException, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
1764
- .set_expected_discover_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
1765
- .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
1766
- ).build()
1767
-
1768
- schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario: TestScenario[InMemoryFilesSource] = (
1769
- TestScenarioBuilder[InMemoryFilesSource]()
1770
- .set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario")
1771
- .set_config(
1772
- {
1773
- "streams": [
1774
- {
1775
- "name": "stream1",
1776
- "format": {"filetype": "csv"},
1777
- "globs": ["a.csv"],
1778
- "validation_policy": "Skip Record",
1779
- "schemaless": True,
1780
- "input_schema": '{"col1": "string", "col2": "string", "col3": "string"}',
1781
- },
1782
- {
1783
- "name": "stream2",
1784
- "format": {"filetype": "csv"},
1785
- "globs": ["b.csv"],
1786
- "validation_policy": "Skip Record",
1787
- },
1788
- ]
1789
- }
1790
- )
1791
- .set_source_builder(
1792
- FileBasedSourceBuilder()
1793
- .set_files(
1794
- {
1795
- "a.csv": {
1796
- "contents": [
1797
- ("col1", "col2"),
1798
- ("val11a", "val12a"),
1799
- ("val21a", "val22a"),
1800
- ],
1801
- "last_modified": "2023-06-05T03:54:07.000Z",
1802
- },
1803
- "b.csv": {
1804
- "contents": [
1805
- ("col3",),
1806
- ("val13b",),
1807
- ("val23b",),
1808
- ],
1809
- "last_modified": "2023-06-05T03:54:07.000Z",
1810
- },
1811
- }
1812
- )
1813
- .set_file_type("csv")
1814
- )
1815
- .set_catalog(CatalogBuilder().with_stream("stream1", SyncMode.full_refresh).with_stream("stream2", SyncMode.full_refresh).build())
1816
- .set_expected_catalog(
1817
- {
1818
- "streams": [
1819
- {
1820
- "json_schema": {
1821
- "type": "object",
1822
- "properties": {
1823
- "data": {"type": "object"},
1824
- "_ab_source_file_last_modified": {"type": "string"},
1825
- "_ab_source_file_url": {"type": "string"},
1826
- },
1827
- },
1828
- "name": "stream1",
1829
- "supported_sync_modes": ["full_refresh", "incremental"],
1830
- "source_defined_cursor": True,
1831
- "default_cursor_field": ["_ab_source_file_last_modified"],
1832
- },
1833
- {
1834
- "json_schema": {
1835
- "type": "object",
1836
- "properties": {
1837
- "col3": {"type": ["null", "string"]},
1838
- "_ab_source_file_last_modified": {"type": "string"},
1839
- "_ab_source_file_url": {"type": "string"},
1840
- },
1841
- },
1842
- "name": "stream2",
1843
- "source_defined_cursor": True,
1844
- "default_cursor_field": ["_ab_source_file_last_modified"],
1845
- "supported_sync_modes": ["full_refresh", "incremental"],
1846
- },
1847
- ]
1848
- }
1849
- )
1850
- .set_expected_check_status("FAILED")
1851
- .set_expected_check_error(AirbyteTracedException, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
1852
- .set_expected_discover_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
1853
- .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
1854
- ).build()
1855
-
1856
- csv_string_can_be_null_with_input_schemas_scenario: TestScenario[InMemoryFilesSource] = (
1857
- TestScenarioBuilder[InMemoryFilesSource]()
1858
- .set_name("csv_string_can_be_null_with_input_schema")
1859
- .set_config(
1860
- {
1861
- "streams": [
1862
- {
1863
- "name": "stream1",
1864
- "globs": ["*"],
1865
- "validation_policy": "Emit Record",
1866
- "input_schema": '{"col1": "string", "col2": "string"}',
1867
- "format": {
1868
- "filetype": "csv",
1869
- "null_values": ["null"],
1870
- },
1871
- }
1872
- ],
1873
- "start_date": "2023-06-04T03:54:07.000000Z",
1874
- }
1875
- )
1876
- .set_source_builder(
1877
- FileBasedSourceBuilder()
1878
- .set_files(
1879
- {
1880
- "a.csv": {
1881
- "contents": [
1882
- ("col1", "col2"),
1883
- ("2", "null"),
1884
- ],
1885
- "last_modified": "2023-06-05T03:54:07.000000Z",
1886
- }
1887
- }
1888
- )
1889
- .set_file_type("csv")
1890
- )
1891
- .set_expected_catalog(
1892
- {
1893
- "streams": [
1894
- {
1895
- "default_cursor_field": ["_ab_source_file_last_modified"],
1896
- "json_schema": {
1897
- "type": "object",
1898
- "properties": {
1899
- "col1": {"type": "string"},
1900
- "col2": {"type": "string"},
1901
- "_ab_source_file_last_modified": {"type": "string"},
1902
- "_ab_source_file_url": {"type": "string"},
1903
- },
1904
- },
1905
- "name": "stream1",
1906
- "source_defined_cursor": True,
1907
- "supported_sync_modes": ["full_refresh", "incremental"],
1908
- }
1909
- ]
1910
- }
1911
- )
1912
- .set_expected_records(
1913
- [
1914
- {
1915
- "data": {
1916
- "col1": "2",
1917
- "col2": None,
1918
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1919
- "_ab_source_file_url": "a.csv",
1920
- },
1921
- "stream": "stream1",
1922
- },
1923
- ]
1924
- )
1925
- ).build()
1926
-
1927
- csv_string_are_not_null_if_strings_can_be_null_is_false_scenario: TestScenario[InMemoryFilesSource] = (
1928
- TestScenarioBuilder[InMemoryFilesSource]()
1929
- .set_name("csv_string_are_not_null_if_strings_can_be_null_is_false")
1930
- .set_config(
1931
- {
1932
- "streams": [
1933
- {
1934
- "name": "stream1",
1935
- "globs": ["*"],
1936
- "validation_policy": "Emit Record",
1937
- "input_schema": '{"col1": "string", "col2": "string"}',
1938
- "format": {
1939
- "filetype": "csv",
1940
- "null_values": ["null"],
1941
- "strings_can_be_null": False,
1942
- },
1943
- }
1944
- ],
1945
- "start_date": "2023-06-04T03:54:07.000000Z",
1946
- }
1947
- )
1948
- .set_source_builder(
1949
- FileBasedSourceBuilder()
1950
- .set_files(
1951
- {
1952
- "a.csv": {
1953
- "contents": [
1954
- ("col1", "col2"),
1955
- ("2", "null"),
1956
- ],
1957
- "last_modified": "2023-06-05T03:54:07.000000Z",
1958
- }
1959
- }
1960
- )
1961
- .set_file_type("csv")
1962
- )
1963
- .set_expected_catalog(
1964
- {
1965
- "streams": [
1966
- {
1967
- "default_cursor_field": ["_ab_source_file_last_modified"],
1968
- "json_schema": {
1969
- "type": "object",
1970
- "properties": {
1971
- "col1": {"type": "string"},
1972
- "col2": {"type": "string"},
1973
- "_ab_source_file_last_modified": {"type": "string"},
1974
- "_ab_source_file_url": {"type": "string"},
1975
- },
1976
- },
1977
- "name": "stream1",
1978
- "source_defined_cursor": True,
1979
- "supported_sync_modes": ["full_refresh", "incremental"],
1980
- }
1981
- ]
1982
- }
1983
- )
1984
- .set_expected_records(
1985
- [
1986
- {
1987
- "data": {
1988
- "col1": "2",
1989
- "col2": "null",
1990
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
1991
- "_ab_source_file_url": "a.csv",
1992
- },
1993
- "stream": "stream1",
1994
- },
1995
- ]
1996
- )
1997
- ).build()
1998
-
1999
- csv_string_not_null_if_no_null_values_scenario: TestScenario[InMemoryFilesSource] = (
2000
- TestScenarioBuilder[InMemoryFilesSource]()
2001
- .set_name("csv_string_not_null_if_no_null_values")
2002
- .set_config(
2003
- {
2004
- "streams": [
2005
- {
2006
- "name": "stream1",
2007
- "globs": ["*"],
2008
- "validation_policy": "Emit Record",
2009
- "format": {
2010
- "filetype": "csv",
2011
- },
2012
- }
2013
- ],
2014
- "start_date": "2023-06-04T03:54:07.000000Z",
2015
- }
2016
- )
2017
- .set_source_builder(
2018
- FileBasedSourceBuilder()
2019
- .set_files(
2020
- {
2021
- "a.csv": {
2022
- "contents": [
2023
- ("col1", "col2"),
2024
- ("2", "null"),
2025
- ],
2026
- "last_modified": "2023-06-05T03:54:07.000Z",
2027
- }
2028
- }
2029
- )
2030
- .set_file_type("csv")
2031
- )
2032
- .set_expected_catalog(
2033
- {
2034
- "streams": [
2035
- {
2036
- "default_cursor_field": ["_ab_source_file_last_modified"],
2037
- "json_schema": {
2038
- "type": "object",
2039
- "properties": {
2040
- "col1": {"type": ["null", "string"]},
2041
- "col2": {"type": ["null", "string"]},
2042
- "_ab_source_file_last_modified": {"type": "string"},
2043
- "_ab_source_file_url": {"type": "string"},
2044
- },
2045
- },
2046
- "name": "stream1",
2047
- "source_defined_cursor": True,
2048
- "supported_sync_modes": ["full_refresh", "incremental"],
2049
- }
2050
- ]
2051
- }
2052
- )
2053
- .set_expected_records(
2054
- [
2055
- {
2056
- "data": {
2057
- "col1": "2",
2058
- "col2": "null",
2059
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2060
- "_ab_source_file_url": "a.csv",
2061
- },
2062
- "stream": "stream1",
2063
- },
2064
- ]
2065
- )
2066
- ).build()
2067
-
2068
- csv_strings_can_be_null_not_quoted_scenario: TestScenario[InMemoryFilesSource] = (
2069
- TestScenarioBuilder[InMemoryFilesSource]()
2070
- .set_name("csv_strings_can_be_null_no_input_schema")
2071
- .set_config(
2072
- {
2073
- "streams": [
2074
- {
2075
- "name": "stream1",
2076
- "globs": ["*"],
2077
- "validation_policy": "Emit Record",
2078
- "format": {"filetype": "csv", "null_values": ["null"]},
2079
- }
2080
- ],
2081
- "start_date": "2023-06-04T03:54:07.000000Z",
2082
- }
2083
- )
2084
- .set_source_builder(
2085
- FileBasedSourceBuilder()
2086
- .set_files(
2087
- {
2088
- "a.csv": {
2089
- "contents": [
2090
- ("col1", "col2"),
2091
- ("2", "null"),
2092
- ],
2093
- "last_modified": "2023-06-05T03:54:07.000Z",
2094
- }
2095
- }
2096
- )
2097
- .set_file_type("csv")
2098
- )
2099
- .set_expected_catalog(
2100
- {
2101
- "streams": [
2102
- {
2103
- "default_cursor_field": ["_ab_source_file_last_modified"],
2104
- "json_schema": {
2105
- "type": "object",
2106
- "properties": {
2107
- "col1": {"type": ["null", "string"]},
2108
- "col2": {"type": ["null", "string"]},
2109
- "_ab_source_file_last_modified": {"type": "string"},
2110
- "_ab_source_file_url": {"type": "string"},
2111
- },
2112
- },
2113
- "name": "stream1",
2114
- "source_defined_cursor": True,
2115
- "supported_sync_modes": ["full_refresh", "incremental"],
2116
- }
2117
- ]
2118
- }
2119
- )
2120
- .set_expected_records(
2121
- [
2122
- {
2123
- "data": {
2124
- "col1": "2",
2125
- "col2": None,
2126
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2127
- "_ab_source_file_url": "a.csv",
2128
- },
2129
- "stream": "stream1",
2130
- },
2131
- ]
2132
- )
2133
- ).build()
2134
-
2135
- csv_newline_in_values_quoted_value_scenario: TestScenario[InMemoryFilesSource] = (
2136
- TestScenarioBuilder[InMemoryFilesSource]()
2137
- .set_name("csv_newline_in_values_quoted_value")
2138
- .set_config(
2139
- {
2140
- "streams": [
2141
- {
2142
- "name": "stream1",
2143
- "globs": ["*"],
2144
- "validation_policy": "Emit Record",
2145
- "format": {
2146
- "filetype": "csv",
2147
- },
2148
- }
2149
- ],
2150
- "start_date": "2023-06-04T03:54:07.000000Z",
2151
- }
2152
- )
2153
- .set_source_builder(
2154
- FileBasedSourceBuilder()
2155
- .set_files(
2156
- {
2157
- "a.csv": {
2158
- "contents": [
2159
- '''"col1","col2"''',
2160
- '''"2","val\n2"''',
2161
- ],
2162
- "last_modified": "2023-06-05T03:54:07.000Z",
2163
- }
2164
- }
2165
- )
2166
- .set_file_type("csv")
2167
- )
2168
- .set_expected_catalog(
2169
- {
2170
- "streams": [
2171
- {
2172
- "default_cursor_field": ["_ab_source_file_last_modified"],
2173
- "json_schema": {
2174
- "type": "object",
2175
- "properties": {
2176
- "col1": {"type": ["null", "string"]},
2177
- "col2": {"type": ["null", "string"]},
2178
- "_ab_source_file_last_modified": {"type": "string"},
2179
- "_ab_source_file_url": {"type": "string"},
2180
- },
2181
- },
2182
- "name": "stream1",
2183
- "source_defined_cursor": True,
2184
- "supported_sync_modes": ["full_refresh", "incremental"],
2185
- }
2186
- ]
2187
- }
2188
- )
2189
- .set_expected_records(
2190
- [
2191
- {
2192
- "data": {
2193
- "col1": "2",
2194
- "col2": "val\n2",
2195
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2196
- "_ab_source_file_url": "a.csv",
2197
- },
2198
- "stream": "stream1",
2199
- },
2200
- ]
2201
- )
2202
- ).build()
2203
-
2204
- csv_newline_in_values_not_quoted_scenario: TestScenario[InMemoryFilesSource] = (
2205
- TestScenarioBuilder[InMemoryFilesSource]()
2206
- .set_name("csv_newline_in_values_not_quoted")
2207
- .set_config(
2208
- {
2209
- "streams": [
2210
- {
2211
- "name": "stream1",
2212
- "globs": ["*"],
2213
- "validation_policy": "Emit Record",
2214
- "format": {
2215
- "filetype": "csv",
2216
- },
2217
- }
2218
- ],
2219
- "start_date": "2023-06-04T03:54:07.000000Z",
2220
- }
2221
- )
2222
- .set_source_builder(
2223
- FileBasedSourceBuilder()
2224
- .set_files(
2225
- {
2226
- "a.csv": {
2227
- "contents": [
2228
- """col1,col2""",
2229
- """2,val\n2""",
2230
- ],
2231
- "last_modified": "2023-06-05T03:54:07.000Z",
2232
- }
2233
- }
2234
- )
2235
- .set_file_type("csv")
2236
- )
2237
- .set_expected_catalog(
2238
- {
2239
- "streams": [
2240
- {
2241
- "default_cursor_field": ["_ab_source_file_last_modified"],
2242
- "json_schema": {
2243
- "type": "object",
2244
- "properties": {
2245
- "col1": {"type": ["null", "string"]},
2246
- "col2": {"type": ["null", "string"]},
2247
- "_ab_source_file_last_modified": {"type": "string"},
2248
- "_ab_source_file_url": {"type": "string"},
2249
- },
2250
- },
2251
- "name": "stream1",
2252
- "source_defined_cursor": True,
2253
- "supported_sync_modes": ["full_refresh", "incremental"],
2254
- }
2255
- ]
2256
- }
2257
- )
2258
- .set_expected_records(
2259
- [
2260
- # Note that the value for col2 is truncated to "val" because the newline is not escaped
2261
- {
2262
- "data": {
2263
- "col1": "2",
2264
- "col2": "val",
2265
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2266
- "_ab_source_file_url": "a.csv",
2267
- },
2268
- "stream": "stream1",
2269
- },
2270
- ]
2271
- )
2272
- .set_expected_read_error(
2273
- AirbyteTracedException,
2274
- f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=2 n_skipped=0",
2275
- )
2276
- .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
2277
- .set_expected_read_error(
2278
- AirbyteTracedException,
2279
- "Please check the logged errors for more information.",
2280
- )
2281
- ).build()
2282
-
2283
- csv_escape_char_is_set_scenario: TestScenario[InMemoryFilesSource] = (
2284
- TestScenarioBuilder[InMemoryFilesSource]()
2285
- .set_name("csv_escape_char_is_set")
2286
- .set_config(
2287
- {
2288
- "streams": [
2289
- {
2290
- "name": "stream1",
2291
- "globs": ["*"],
2292
- "validation_policy": "Emit Record",
2293
- "format": {
2294
- "filetype": "csv",
2295
- "double_quotes": False,
2296
- "quote_char": '"',
2297
- "delimiter": ",",
2298
- "escape_char": "\\",
2299
- },
2300
- }
2301
- ],
2302
- "start_date": "2023-06-04T03:54:07.000000Z",
2303
- }
2304
- )
2305
- .set_source_builder(
2306
- FileBasedSourceBuilder()
2307
- .set_files(
2308
- {
2309
- "a.csv": {
2310
- "contents": [
2311
- """col1,col2""",
2312
- '''val11,"val\\"2"''',
2313
- ],
2314
- "last_modified": "2023-06-05T03:54:07.000Z",
2315
- }
2316
- }
2317
- )
2318
- .set_file_type("csv")
2319
- )
2320
- .set_expected_catalog(
2321
- {
2322
- "streams": [
2323
- {
2324
- "default_cursor_field": ["_ab_source_file_last_modified"],
2325
- "json_schema": {
2326
- "type": "object",
2327
- "properties": {
2328
- "col1": {"type": ["null", "string"]},
2329
- "col2": {"type": ["null", "string"]},
2330
- "_ab_source_file_last_modified": {"type": "string"},
2331
- "_ab_source_file_url": {"type": "string"},
2332
- },
2333
- },
2334
- "name": "stream1",
2335
- "source_defined_cursor": True,
2336
- "supported_sync_modes": ["full_refresh", "incremental"],
2337
- }
2338
- ]
2339
- }
2340
- )
2341
- .set_expected_records(
2342
- [
2343
- {
2344
- "data": {
2345
- "col1": "val11",
2346
- "col2": 'val"2',
2347
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2348
- "_ab_source_file_url": "a.csv",
2349
- },
2350
- "stream": "stream1",
2351
- },
2352
- ]
2353
- )
2354
- ).build()
2355
-
2356
- csv_double_quote_is_set_scenario: TestScenario[InMemoryFilesSource] = (
2357
- TestScenarioBuilder[InMemoryFilesSource]()
2358
- .set_name("csv_doublequote_is_set")
2359
- # This scenario tests that quotes are properly escaped when double_quotes is True
2360
- .set_config(
2361
- {
2362
- "streams": [
2363
- {
2364
- "name": "stream1",
2365
- "globs": ["*"],
2366
- "validation_policy": "Emit Record",
2367
- "format": {
2368
- "filetype": "csv",
2369
- "double_quotes": True,
2370
- "quote_char": '"',
2371
- "delimiter": ",",
2372
- },
2373
- }
2374
- ],
2375
- "start_date": "2023-06-04T03:54:07.000000Z",
2376
- }
2377
- )
2378
- .set_source_builder(
2379
- FileBasedSourceBuilder()
2380
- .set_files(
2381
- {
2382
- "a.csv": {
2383
- "contents": [
2384
- """col1,col2""",
2385
- '''val11,"val""2"''',
2386
- ],
2387
- "last_modified": "2023-06-05T03:54:07.000Z",
2388
- }
2389
- }
2390
- )
2391
- .set_file_type("csv")
2392
- )
2393
- .set_expected_catalog(
2394
- {
2395
- "streams": [
2396
- {
2397
- "default_cursor_field": ["_ab_source_file_last_modified"],
2398
- "json_schema": {
2399
- "type": "object",
2400
- "properties": {
2401
- "col1": {"type": ["null", "string"]},
2402
- "col2": {"type": ["null", "string"]},
2403
- "_ab_source_file_last_modified": {"type": "string"},
2404
- "_ab_source_file_url": {"type": "string"},
2405
- },
2406
- },
2407
- "name": "stream1",
2408
- "source_defined_cursor": True,
2409
- "supported_sync_modes": ["full_refresh", "incremental"],
2410
- }
2411
- ]
2412
- }
2413
- )
2414
- .set_expected_records(
2415
- [
2416
- {
2417
- "data": {
2418
- "col1": "val11",
2419
- "col2": 'val"2',
2420
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2421
- "_ab_source_file_url": "a.csv",
2422
- },
2423
- "stream": "stream1",
2424
- },
2425
- ]
2426
- )
2427
- ).build()
2428
-
2429
- csv_custom_delimiter_with_escape_char_scenario: TestScenario[InMemoryFilesSource] = (
2430
- TestScenarioBuilder[InMemoryFilesSource]()
2431
- .set_name("csv_custom_delimiter_with_escape_char")
2432
- # This scenario tests that a value can contain the delimiter if it is wrapped in the quote_char
2433
- .set_config(
2434
- {
2435
- "streams": [
2436
- {
2437
- "name": "stream1",
2438
- "globs": ["*"],
2439
- "validation_policy": "Emit Record",
2440
- "format": {"filetype": "csv", "double_quotes": True, "quote_char": "@", "delimiter": "|", "escape_char": "+"},
2441
- }
2442
- ],
2443
- "start_date": "2023-06-04T03:54:07.000000Z",
2444
- }
2445
- )
2446
- .set_source_builder(
2447
- FileBasedSourceBuilder()
2448
- .set_files(
2449
- {
2450
- "a.csv": {
2451
- "contents": [
2452
- """col1|col2""",
2453
- """val"1,1|val+|2""",
2454
- ],
2455
- "last_modified": "2023-06-05T03:54:07.000Z",
2456
- }
2457
- }
2458
- )
2459
- .set_file_type("csv")
2460
- )
2461
- .set_expected_catalog(
2462
- {
2463
- "streams": [
2464
- {
2465
- "default_cursor_field": ["_ab_source_file_last_modified"],
2466
- "json_schema": {
2467
- "type": "object",
2468
- "properties": {
2469
- "col1": {"type": ["null", "string"]},
2470
- "col2": {"type": ["null", "string"]},
2471
- "_ab_source_file_last_modified": {"type": "string"},
2472
- "_ab_source_file_url": {"type": "string"},
2473
- },
2474
- },
2475
- "name": "stream1",
2476
- "source_defined_cursor": True,
2477
- "supported_sync_modes": ["full_refresh", "incremental"],
2478
- }
2479
- ]
2480
- }
2481
- )
2482
- .set_expected_records(
2483
- [
2484
- {
2485
- "data": {
2486
- "col1": 'val"1,1',
2487
- "col2": "val|2",
2488
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2489
- "_ab_source_file_url": "a.csv",
2490
- },
2491
- "stream": "stream1",
2492
- },
2493
- ]
2494
- )
2495
- ).build()
2496
-
2497
- csv_custom_delimiter_in_double_quotes_scenario: TestScenario[InMemoryFilesSource] = (
2498
- TestScenarioBuilder[InMemoryFilesSource]()
2499
- .set_name("csv_custom_delimiter_in_double_quotes")
2500
- # This scenario tests that a value can contain the delimiter if it is wrapped in the quote_char
2501
- .set_config(
2502
- {
2503
- "streams": [
2504
- {
2505
- "name": "stream1",
2506
- "globs": ["*"],
2507
- "validation_policy": "Emit Record",
2508
- "format": {
2509
- "filetype": "csv",
2510
- "double_quotes": True,
2511
- "quote_char": "@",
2512
- "delimiter": "|",
2513
- },
2514
- }
2515
- ],
2516
- "start_date": "2023-06-04T03:54:07.000000Z",
2517
- }
2518
- )
2519
- .set_source_builder(
2520
- FileBasedSourceBuilder()
2521
- .set_files(
2522
- {
2523
- "a.csv": {
2524
- "contents": [
2525
- """col1|col2""",
2526
- """val"1,1|@val|2@""",
2527
- ],
2528
- "last_modified": "2023-06-05T03:54:07.000Z",
2529
- }
2530
- }
2531
- )
2532
- .set_file_type("csv")
2533
- )
2534
- .set_expected_catalog(
2535
- {
2536
- "streams": [
2537
- {
2538
- "default_cursor_field": ["_ab_source_file_last_modified"],
2539
- "json_schema": {
2540
- "type": "object",
2541
- "properties": {
2542
- "col1": {"type": ["null", "string"]},
2543
- "col2": {"type": ["null", "string"]},
2544
- "_ab_source_file_last_modified": {"type": "string"},
2545
- "_ab_source_file_url": {"type": "string"},
2546
- },
2547
- },
2548
- "name": "stream1",
2549
- "source_defined_cursor": True,
2550
- "supported_sync_modes": ["full_refresh", "incremental"],
2551
- }
2552
- ]
2553
- }
2554
- )
2555
- .set_expected_records(
2556
- [
2557
- {
2558
- "data": {
2559
- "col1": 'val"1,1',
2560
- "col2": "val|2",
2561
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2562
- "_ab_source_file_url": "a.csv",
2563
- },
2564
- "stream": "stream1",
2565
- },
2566
- ]
2567
- )
2568
- ).build()
2569
-
2570
- csv_skip_before_header_scenario: TestScenario[InMemoryFilesSource] = (
2571
- TestScenarioBuilder[InMemoryFilesSource]()
2572
- .set_name("csv_skip_before_header")
2573
- .set_config(
2574
- {
2575
- "streams": [
2576
- {
2577
- "name": "stream1",
2578
- "globs": ["*"],
2579
- "validation_policy": "Emit Record",
2580
- "format": {"filetype": "csv", "skip_rows_before_header": 2},
2581
- }
2582
- ],
2583
- "start_date": "2023-06-04T03:54:07.000000Z",
2584
- }
2585
- )
2586
- .set_source_builder(
2587
- FileBasedSourceBuilder()
2588
- .set_files(
2589
- {
2590
- "a.csv": {
2591
- "contents": [
2592
- ("skip_this", "skip_this"),
2593
- ("skip_this_too", "skip_this_too"),
2594
- ("col1", "col2"),
2595
- ("val11", "val12"),
2596
- ],
2597
- "last_modified": "2023-06-05T03:54:07.000Z",
2598
- }
2599
- }
2600
- )
2601
- .set_file_type("csv")
2602
- )
2603
- .set_expected_catalog(
2604
- {
2605
- "streams": [
2606
- {
2607
- "default_cursor_field": ["_ab_source_file_last_modified"],
2608
- "json_schema": {
2609
- "type": "object",
2610
- "properties": {
2611
- "col1": {"type": ["null", "string"]},
2612
- "col2": {"type": ["null", "string"]},
2613
- "_ab_source_file_last_modified": {"type": "string"},
2614
- "_ab_source_file_url": {"type": "string"},
2615
- },
2616
- },
2617
- "name": "stream1",
2618
- "source_defined_cursor": True,
2619
- "supported_sync_modes": ["full_refresh", "incremental"],
2620
- }
2621
- ]
2622
- }
2623
- )
2624
- .set_expected_records(
2625
- [
2626
- {
2627
- "data": {
2628
- "col1": "val11",
2629
- "col2": "val12",
2630
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2631
- "_ab_source_file_url": "a.csv",
2632
- },
2633
- "stream": "stream1",
2634
- },
2635
- ]
2636
- )
2637
- ).build()
2638
-
2639
- csv_skip_after_header_scenario: TestScenario[InMemoryFilesSource] = (
2640
- TestScenarioBuilder[InMemoryFilesSource]()
2641
- .set_name("csv_skip_after_header")
2642
- .set_config(
2643
- {
2644
- "streams": [
2645
- {
2646
- "name": "stream1",
2647
- "globs": ["*"],
2648
- "validation_policy": "Emit Record",
2649
- "format": {"filetype": "csv", "skip_rows_after_header": 2},
2650
- }
2651
- ],
2652
- "start_date": "2023-06-04T03:54:07.000000Z",
2653
- }
2654
- )
2655
- .set_source_builder(
2656
- FileBasedSourceBuilder()
2657
- .set_files(
2658
- {
2659
- "a.csv": {
2660
- "contents": [
2661
- ("col1", "col2"),
2662
- ("skip_this", "skip_this"),
2663
- ("skip_this_too", "skip_this_too"),
2664
- ("val11", "val12"),
2665
- ],
2666
- "last_modified": "2023-06-05T03:54:07.000Z",
2667
- }
2668
- }
2669
- )
2670
- .set_file_type("csv")
2671
- )
2672
- .set_expected_catalog(
2673
- {
2674
- "streams": [
2675
- {
2676
- "default_cursor_field": ["_ab_source_file_last_modified"],
2677
- "json_schema": {
2678
- "type": "object",
2679
- "properties": {
2680
- "col1": {"type": ["null", "string"]},
2681
- "col2": {"type": ["null", "string"]},
2682
- "_ab_source_file_last_modified": {"type": "string"},
2683
- "_ab_source_file_url": {"type": "string"},
2684
- },
2685
- },
2686
- "name": "stream1",
2687
- "source_defined_cursor": True,
2688
- "supported_sync_modes": ["full_refresh", "incremental"],
2689
- }
2690
- ]
2691
- }
2692
- )
2693
- .set_expected_records(
2694
- [
2695
- {
2696
- "data": {
2697
- "col1": "val11",
2698
- "col2": "val12",
2699
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2700
- "_ab_source_file_url": "a.csv",
2701
- },
2702
- "stream": "stream1",
2703
- },
2704
- ]
2705
- )
2706
- ).build()
2707
-
2708
- csv_skip_before_and_after_header_scenario: TestScenario[InMemoryFilesSource] = (
2709
- TestScenarioBuilder[InMemoryFilesSource]()
2710
- .set_name("csv_skip_before_after_header")
2711
- .set_config(
2712
- {
2713
- "streams": [
2714
- {
2715
- "name": "stream1",
2716
- "globs": ["*"],
2717
- "validation_policy": "Emit Record",
2718
- "format": {
2719
- "filetype": "csv",
2720
- "skip_rows_before_header": 1,
2721
- "skip_rows_after_header": 1,
2722
- },
2723
- }
2724
- ],
2725
- "start_date": "2023-06-04T03:54:07.000000Z",
2726
- }
2727
- )
2728
- .set_source_builder(
2729
- FileBasedSourceBuilder()
2730
- .set_files(
2731
- {
2732
- "a.csv": {
2733
- "contents": [
2734
- ("skip_this", "skip_this"),
2735
- ("col1", "col2"),
2736
- ("skip_this_too", "skip_this_too"),
2737
- ("val11", "val12"),
2738
- ],
2739
- "last_modified": "2023-06-05T03:54:07.000Z",
2740
- }
2741
- }
2742
- )
2743
- .set_file_type("csv")
2744
- )
2745
- .set_expected_catalog(
2746
- {
2747
- "streams": [
2748
- {
2749
- "default_cursor_field": ["_ab_source_file_last_modified"],
2750
- "json_schema": {
2751
- "type": "object",
2752
- "properties": {
2753
- "col1": {"type": ["null", "string"]},
2754
- "col2": {"type": ["null", "string"]},
2755
- "_ab_source_file_last_modified": {"type": "string"},
2756
- "_ab_source_file_url": {"type": "string"},
2757
- },
2758
- },
2759
- "name": "stream1",
2760
- "source_defined_cursor": True,
2761
- "supported_sync_modes": ["full_refresh", "incremental"],
2762
- }
2763
- ]
2764
- }
2765
- )
2766
- .set_expected_records(
2767
- [
2768
- {
2769
- "data": {
2770
- "col1": "val11",
2771
- "col2": "val12",
2772
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2773
- "_ab_source_file_url": "a.csv",
2774
- },
2775
- "stream": "stream1",
2776
- },
2777
- ]
2778
- )
2779
- ).build()
2780
-
2781
- csv_autogenerate_column_names_scenario: TestScenario[InMemoryFilesSource] = (
2782
- TestScenarioBuilder[InMemoryFilesSource]()
2783
- .set_name("csv_autogenerate_column_names")
2784
- .set_config(
2785
- {
2786
- "streams": [
2787
- {
2788
- "name": "stream1",
2789
- "globs": ["*"],
2790
- "validation_policy": "Emit Record",
2791
- "format": {
2792
- "filetype": "csv",
2793
- "header_definition": {"header_definition_type": "Autogenerated"},
2794
- },
2795
- }
2796
- ],
2797
- "start_date": "2023-06-04T03:54:07.000000Z",
2798
- }
2799
- )
2800
- .set_source_builder(
2801
- FileBasedSourceBuilder()
2802
- .set_files(
2803
- {
2804
- "a.csv": {
2805
- "contents": [
2806
- ("val11", "val12"),
2807
- ],
2808
- "last_modified": "2023-06-05T03:54:07.000Z",
2809
- }
2810
- }
2811
- )
2812
- .set_file_type("csv")
2813
- )
2814
- .set_expected_catalog(
2815
- {
2816
- "streams": [
2817
- {
2818
- "default_cursor_field": ["_ab_source_file_last_modified"],
2819
- "json_schema": {
2820
- "type": "object",
2821
- "properties": {
2822
- "f0": {"type": ["null", "string"]},
2823
- "f1": {"type": ["null", "string"]},
2824
- "_ab_source_file_last_modified": {"type": "string"},
2825
- "_ab_source_file_url": {"type": "string"},
2826
- },
2827
- },
2828
- "name": "stream1",
2829
- "source_defined_cursor": True,
2830
- "supported_sync_modes": ["full_refresh", "incremental"],
2831
- }
2832
- ]
2833
- }
2834
- )
2835
- .set_expected_records(
2836
- [
2837
- {
2838
- "data": {
2839
- "f0": "val11",
2840
- "f1": "val12",
2841
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2842
- "_ab_source_file_url": "a.csv",
2843
- },
2844
- "stream": "stream1",
2845
- },
2846
- ]
2847
- )
2848
- ).build()
2849
-
2850
- csv_custom_bool_values_scenario: TestScenario[InMemoryFilesSource] = (
2851
- TestScenarioBuilder[InMemoryFilesSource]()
2852
- .set_name("csv_custom_bool_values")
2853
- .set_config(
2854
- {
2855
- "streams": [
2856
- {
2857
- "name": "stream1",
2858
- "globs": ["*"],
2859
- "validation_policy": "Emit Record",
2860
- "input_schema": '{"col1": "boolean", "col2": "boolean"}',
2861
- "format": {
2862
- "filetype": "csv",
2863
- "true_values": ["this_is_true"],
2864
- "false_values": ["this_is_false"],
2865
- },
2866
- }
2867
- ],
2868
- "start_date": "2023-06-04T03:54:07.000000Z",
2869
- }
2870
- )
2871
- .set_source_builder(
2872
- FileBasedSourceBuilder()
2873
- .set_files(
2874
- {
2875
- "a.csv": {
2876
- "contents": [
2877
- ("col1", "col2"),
2878
- ("this_is_true", "this_is_false"),
2879
- ],
2880
- "last_modified": "2023-06-05T03:54:07.000Z",
2881
- }
2882
- }
2883
- )
2884
- .set_file_type("csv")
2885
- )
2886
- .set_expected_catalog(
2887
- {
2888
- "streams": [
2889
- {
2890
- "default_cursor_field": ["_ab_source_file_last_modified"],
2891
- "json_schema": {
2892
- "type": "object",
2893
- "properties": {
2894
- "col1": {"type": "boolean"},
2895
- "col2": {"type": "boolean"},
2896
- "_ab_source_file_last_modified": {"type": "string"},
2897
- "_ab_source_file_url": {"type": "string"},
2898
- },
2899
- },
2900
- "name": "stream1",
2901
- "source_defined_cursor": True,
2902
- "supported_sync_modes": ["full_refresh", "incremental"],
2903
- }
2904
- ]
2905
- }
2906
- )
2907
- .set_expected_records(
2908
- [
2909
- {
2910
- "data": {
2911
- "col1": True,
2912
- "col2": False,
2913
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2914
- "_ab_source_file_url": "a.csv",
2915
- },
2916
- "stream": "stream1",
2917
- },
2918
- ]
2919
- )
2920
- ).build()
2921
-
2922
- csv_custom_null_values_scenario: TestScenario[InMemoryFilesSource] = (
2923
- TestScenarioBuilder[InMemoryFilesSource]()
2924
- .set_name("csv_custom_null_values")
2925
- .set_config(
2926
- {
2927
- "streams": [
2928
- {
2929
- "name": "stream1",
2930
- "globs": ["*"],
2931
- "validation_policy": "Emit Record",
2932
- "input_schema": '{"col1": "boolean", "col2": "string"}',
2933
- "format": {
2934
- "filetype": "csv",
2935
- "null_values": ["null"],
2936
- },
2937
- }
2938
- ],
2939
- "start_date": "2023-06-04T03:54:07.000000Z",
2940
- }
2941
- )
2942
- .set_source_builder(
2943
- FileBasedSourceBuilder()
2944
- .set_files(
2945
- {
2946
- "a.csv": {
2947
- "contents": [
2948
- ("col1", "col2"),
2949
- ("null", "na"),
2950
- ],
2951
- "last_modified": "2023-06-05T03:54:07.000Z",
2952
- }
2953
- }
2954
- )
2955
- .set_file_type("csv")
2956
- )
2957
- .set_expected_catalog(
2958
- {
2959
- "streams": [
2960
- {
2961
- "default_cursor_field": ["_ab_source_file_last_modified"],
2962
- "json_schema": {
2963
- "type": "object",
2964
- "properties": {
2965
- "col1": {"type": "boolean"},
2966
- "col2": {"type": "string"},
2967
- "_ab_source_file_last_modified": {"type": "string"},
2968
- "_ab_source_file_url": {"type": "string"},
2969
- },
2970
- },
2971
- "name": "stream1",
2972
- "source_defined_cursor": True,
2973
- "supported_sync_modes": ["full_refresh", "incremental"],
2974
- }
2975
- ]
2976
- }
2977
- )
2978
- .set_expected_records(
2979
- [
2980
- {
2981
- "data": {
2982
- "col1": None,
2983
- "col2": "na",
2984
- "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
2985
- "_ab_source_file_url": "a.csv",
2986
- },
2987
- "stream": "stream1",
2988
- },
2989
- ]
2990
- )
2991
- ).build()
2992
-
2993
- earlier_csv_scenario: TestScenario[InMemoryFilesSource] = (
2994
- TestScenarioBuilder[InMemoryFilesSource]()
2995
- .set_name("earlier_csv_stream")
2996
- .set_config(
2997
- {
2998
- "streams": [
2999
- {
3000
- "name": "stream1",
3001
- "format": {"filetype": "csv"},
3002
- "globs": ["*"],
3003
- "validation_policy": "Emit Record",
3004
- }
3005
- ],
3006
- "start_date": "2023-06-10T03:54:07.000000Z",
3007
- }
3008
- )
3009
- .set_source_builder(
3010
- FileBasedSourceBuilder()
3011
- .set_files(
3012
- {
3013
- "a.csv": {
3014
- "contents": [
3015
- ("col1", "col2"),
3016
- ("val11", "val12"),
3017
- ("val21", "val22"),
3018
- ],
3019
- "last_modified": "2023-06-05T03:54:07.000000Z",
3020
- }
3021
- }
3022
- )
3023
- .set_file_type("csv")
3024
- )
3025
- .set_expected_check_status("FAILED")
3026
- .set_expected_check_error(AirbyteTracedException, FileBasedSourceError.EMPTY_STREAM.value)
3027
- .set_expected_catalog(
3028
- {
3029
- "streams": [
3030
- {
3031
- "default_cursor_field": ["_ab_source_file_last_modified"],
3032
- "json_schema": {
3033
- "type": "object",
3034
- "properties": {
3035
- "col1": {"type": "string"},
3036
- "col2": {"type": "string"},
3037
- "_ab_source_file_last_modified": {"type": "string"},
3038
- "_ab_source_file_url": {"type": "string"},
3039
- },
3040
- },
3041
- "name": "stream1",
3042
- "source_defined_cursor": True,
3043
- "supported_sync_modes": ["full_refresh", "incremental"],
3044
- }
3045
- ]
3046
- }
3047
- )
3048
- .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
3049
- ).build()
3050
-
3051
- csv_no_records_scenario: TestScenario[InMemoryFilesSource] = (
3052
- TestScenarioBuilder[InMemoryFilesSource]()
3053
- .set_name("csv_empty_no_records")
3054
- .set_config(
3055
- {
3056
- "streams": [
3057
- {
3058
- "name": "stream1",
3059
- "globs": ["*"],
3060
- "validation_policy": "Emit Record",
3061
- "input_schema": '{"col1": "boolean", "col2": "string"}',
3062
- "format": {
3063
- "filetype": "csv",
3064
- "null_values": ["null"],
3065
- },
3066
- }
3067
- ],
3068
- "start_date": "2023-06-04T03:54:07.000000Z",
3069
- }
3070
- )
3071
- .set_source_builder(
3072
- FileBasedSourceBuilder()
3073
- .set_files(
3074
- {
3075
- "a.csv": {
3076
- "contents": [("col1", "col2")], # column headers, but no data rows
3077
- "last_modified": "2023-06-05T03:54:07.000Z",
3078
- }
3079
- }
3080
- )
3081
- .set_file_type("csv")
3082
- )
3083
- .set_expected_catalog(
3084
- {
3085
- "streams": [
3086
- {
3087
- "default_cursor_field": ["_ab_source_file_last_modified"],
3088
- "json_schema": {
3089
- "type": "object",
3090
- "properties": {
3091
- "col1": {"type": "boolean"},
3092
- "col2": {"type": "string"},
3093
- "_ab_source_file_last_modified": {"type": "string"},
3094
- "_ab_source_file_url": {"type": "string"},
3095
- },
3096
- },
3097
- "name": "stream1",
3098
- "source_defined_cursor": True,
3099
- "supported_sync_modes": ["full_refresh", "incremental"],
3100
- }
3101
- ]
3102
- }
3103
- )
3104
- .set_expected_records([])
3105
- ).build()