airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,527 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import csv
6
+ import json
7
+ import logging
8
+ from abc import ABC, abstractmethod
9
+ from collections import defaultdict
10
+ from functools import partial
11
+ from io import IOBase
12
+ from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set, Tuple
13
+ from uuid import uuid4
14
+
15
+ import orjson
16
+
17
+ from airbyte_cdk.models import FailureType
18
+ from airbyte_cdk.sources.file_based.config.csv_format import (
19
+ CsvFormat,
20
+ CsvHeaderAutogenerated,
21
+ CsvHeaderUserProvided,
22
+ InferenceType,
23
+ )
24
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
25
+ from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
26
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
27
+ AbstractFileBasedStreamReader,
28
+ FileReadMode,
29
+ )
30
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
31
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
32
+ from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType
33
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
34
+
35
+ DIALECT_NAME = "_config_dialect"
36
+
37
+
38
class _CsvReader:
    """Low-level helper that lazily reads a CSV file and yields its rows as dictionaries."""

    def read_data(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        file_read_mode: FileReadMode,
    ) -> Generator[Dict[str, Any], None, None]:
        """
        Lazily yield the rows of `file` as dictionaries keyed by the resolved header names.

        A csv dialect built from the stream's format is registered under a unique name for the
        duration of the read and is always unregistered when the generator finishes, fails or is
        closed early.

        :param config: stream configuration; its name and csv format drive the parsing
        :param file: the remote file to read
        :param stream_reader: reader used to open `file`
        :param logger: logger used to report rows whose field count does not match the headers
        :param file_read_mode: mode passed to `stream_reader.open_file`
        :raises AirbyteTracedException: if resolving the headers raises a `UnicodeError`
        :raises RecordParseError: if a row's field count does not match the headers and
            `ignore_errors_on_fields_mismatch` is not set on the format
        """
        config_format = _extract_format(config)
        lineno = 0

        # Formats are configured individually per-stream so a unique dialect is registered for each stream.
        # Give each stream's dialect a unique name; otherwise, when we are doing a concurrent sync we can end up
        # with a race condition where a thread attempts to use a dialect before a separate thread has finished
        # registering it.
        dialect_name = f"{config.name}_{str(uuid4())}_{DIALECT_NAME}"
        csv.register_dialect(
            dialect_name,
            delimiter=config_format.delimiter,
            quotechar=config_format.quote_char,
            escapechar=config_format.escape_char,
            doublequote=config_format.double_quote,
            quoting=csv.QUOTE_MINIMAL,
        )
        # Everything after a successful registration runs under the try/finally so the dialect is also
        # released when opening the file, resolving the headers or skipping rows fails, not only when
        # the row loop below is interrupted.
        try:
            with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
                try:
                    headers = self._get_headers(fp, config_format, dialect_name)
                except UnicodeError as error:
                    raise AirbyteTracedException(
                        message=f"{FileBasedSourceError.ENCODING_ERROR.value} Expected encoding: {config_format.encoding}",
                    ) from error

                rows_to_skip = (
                    config_format.skip_rows_before_header
                    + (1 if config_format.header_definition.has_header_row() else 0)
                    + config_format.skip_rows_after_header
                )
                self._skip_rows(fp, rows_to_skip)
                lineno += rows_to_skip

                reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers)  # type: ignore
                for row in reader:
                    lineno += 1

                    # The row was not properly parsed if any of the keys or values are None. This will most
                    # likely occur if there are more columns than headers or more headers than columns.
                    # csv.DictReader stores surplus values under the key None (restkey) when a row has more
                    # fields than headers, and fills absent fields with None (restval) when it has fewer.
                    # NOTE(review): the wording of the two log messages below ("missing"/"extra") looks
                    # swapped relative to those semantics, and the row is still yielded after the
                    # "Skipping record" log - confirm intent before changing either.
                    if None in row:
                        if config_format.ignore_errors_on_fields_mismatch:
                            logger.error(
                                f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with missing column."
                            )
                        else:
                            raise RecordParseError(
                                FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_COLUMNS,
                                filename=file.uri,
                                lineno=lineno,
                            )
                    if None in row.values():
                        if config_format.ignore_errors_on_fields_mismatch:
                            logger.error(
                                f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with extra column."
                            )
                        else:
                            raise RecordParseError(
                                FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_ROWS,
                                filename=file.uri,
                                lineno=lineno,
                            )
                    yield row
        finally:
            # Reached on normal exhaustion, on any exception above, or on GeneratorExit
            csv.unregister_dialect(dialect_name)

    def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]:
        """
        Resolve the column names to use for the file.

        User-provided column names are returned directly without touching `fp`. Otherwise the
        names are either auto-generated from the width of the first row found after skipping
        `skip_rows_before_header + skip_rows_after_header` rows, or read from the row found after
        skipping `skip_rows_before_header` rows; in both cases `fp` is rewound to offset 0
        before returning.

        Assumes the fp is pointing to the beginning of the file and that the dialect has already
        been registered.

        :raises StopIteration: if there is no row left to read at the expected position
        """
        if isinstance(config_format.header_definition, CsvHeaderUserProvided):
            return config_format.header_definition.column_names

        if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
            self._skip_rows(
                fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header
            )
            headers = self._auto_generate_headers(fp, dialect_name)
        else:
            # Then read the header
            self._skip_rows(fp, config_format.skip_rows_before_header)
            reader = csv.reader(fp, dialect=dialect_name)  # type: ignore
            headers = list(next(reader))

        fp.seek(0)
        return headers

    def _auto_generate_headers(self, fp: IOBase, dialect_name: str) -> List[str]:
        """
        Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True.
        See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html

        The number of names equals the number of fields in the next row read from `fp`.
        """
        reader = csv.reader(fp, dialect=dialect_name)  # type: ignore
        number_of_columns = len(next(reader))  # type: ignore
        return [f"f{i}" for i in range(number_of_columns)]

    @staticmethod
    def _skip_rows(fp: IOBase, rows_to_skip: int) -> None:
        """
        Advance `fp` by `rows_to_skip` lines. This has to be done on the file object itself, not the reader
        """
        for _ in range(rows_to_skip):
            fp.readline()
152
+
153
+
154
class CsvParser(FileTypeParser):
    """
    File type parser for CSV files: infers a JSON schema from a file's rows and parses rows into records.
    """

    # Schema inference stops reading a file once at least this many bytes of cell data have been counted
    _MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000

    def __init__(self, csv_reader: Optional[_CsvReader] = None, csv_field_max_bytes: int = 2**31):
        """
        :param csv_reader: reader used to iterate over the rows of a file; a `_CsvReader` is created when omitted
        :param csv_field_max_bytes: new limit passed to `csv.field_size_limit`
        """
        # Increase the maximum length of data that can be parsed in a single CSV field. The default is 128k, which is typically sufficient
        # but given the use of Airbyte in loading a large variety of data it is best to allow for a larger maximum field size to avoid
        # skipping data on load. https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
        csv.field_size_limit(csv_field_max_bytes)
        self._csv_reader = csv_reader if csv_reader else _CsvReader()

    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        CsvParser does not require config checks, implicit pydantic validation is enough.
        """
        return True, None

    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Infer a schema for `file`.

        The input schema of the config is returned as is when one is set. Otherwise rows are read until
        `_MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE` is reached and a type is inferred for every header.

        :raises AirbyteTracedException: when the file yields no rows
        """
        input_schema = config.get_input_schema()
        if input_schema:
            return input_schema

        # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
        # sources will likely require one. Rather than modify the interface now we can wait until the real use case
        config_format = _extract_format(config)
        type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict(
            lambda: _JsonTypeInferrer(
                config_format.true_values, config_format.false_values, config_format.null_values
            )
            if config_format.inference_type != InferenceType.NONE
            else _DisabledTypeInferrer()
        )
        data_generator = self._csv_reader.read_data(
            config, file, stream_reader, logger, self.file_read_mode
        )
        try:
            read_bytes = 0
            for row in data_generator:
                for header, value in row.items():
                    type_inferrer_by_field[header].add_value(value)
                    # This is not accurate as a representation of how many bytes were read because csv does some processing on the actual value
                    # before returning. Given we would like to be more accurate, we could wrap the IO file using a decorator
                    read_bytes += len(value)
                read_bytes += len(row) - 1  # for separators
                if read_bytes >= self._MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
                    break

            if not type_inferrer_by_field:
                raise AirbyteTracedException(
                    message=f"Could not infer schema as there are no rows in {file.uri}. If having an empty CSV file is expected, ignore this. "
                    f"Else, please contact Airbyte.",
                    failure_type=FailureType.config_error,
                )
            return {
                header.strip(): {"type": type_inferred.infer()}
                for header, type_inferred in type_inferrer_by_field.items()
            }
        finally:
            # Close the reader generator on every exit path, including the "no rows" error above
            data_generator.close()

    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]],
    ) -> Iterable[Dict[str, Any]]:
        """
        Yield the rows of `file` as records.

        Values are cast according to `discovered_schema` (unless the stream is schemaless or no schema is
        given) and values matching the configured null values are replaced by None.

        :raises RecordParseError: when the underlying reader fails to parse a row
        """
        line_no = 0
        # Stays None until the reader generator exists so that a failure during setup is not masked by the cleanup below
        data_generator = None
        try:
            config_format = _extract_format(config)
            if discovered_schema:
                property_types = {
                    col: prop["type"] for col, prop in discovered_schema["properties"].items()
                }
                deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
            else:
                deduped_property_types = {}
            cast_fn = CsvParser._get_cast_function(
                deduped_property_types, config_format, logger, config.schemaless
            )
            data_generator = self._csv_reader.read_data(
                config, file, stream_reader, logger, self.file_read_mode
            )
            for row in data_generator:
                line_no += 1
                yield CsvParser._to_nullable(
                    cast_fn(row),
                    deduped_property_types,
                    config_format.null_values,
                    config_format.strings_can_be_null,
                )
        except RecordParseError as parse_err:
            raise RecordParseError(
                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no
            ) from parse_err
        finally:
            if data_generator is not None:
                data_generator.close()

    @property
    def file_read_mode(self) -> FileReadMode:
        """Mode in which files are opened for this parser."""
        return FileReadMode.READ

    @staticmethod
    def _get_cast_function(
        deduped_property_types: Mapping[str, str],
        config_format: CsvFormat,
        logger: logging.Logger,
        schemaless: bool,
    ) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
        """
        Return the function applied to every row before it is emitted: a type-casting function when
        property types are known and the stream is not schemaless, otherwise a pass-through.
        """
        # Only cast values if the schema is provided
        if deduped_property_types and not schemaless:
            return partial(
                CsvParser._cast_types,
                deduped_property_types=deduped_property_types,
                config_format=config_format,
                logger=logger,
            )
        else:
            # If no schema is provided, yield the rows as they are
            return _no_cast

    @staticmethod
    def _to_nullable(
        row: Mapping[str, str],
        deduped_property_types: Mapping[str, str],
        null_values: Set[str],
        strings_can_be_null: bool,
    ) -> Dict[str, Optional[str]]:
        """
        Return a copy of `row` where every value considered null (see `_value_is_none`) is replaced by None.
        """
        nullable = {
            k: None
            if CsvParser._value_is_none(
                v, deduped_property_types.get(k), null_values, strings_can_be_null
            )
            else v
            for k, v in row.items()
        }
        return nullable

    @staticmethod
    def _value_is_none(
        value: Any,
        deduped_property_type: Optional[str],
        null_values: Set[str],
        strings_can_be_null: bool,
    ) -> bool:
        """
        Tell whether `value` must be emitted as null: it has to be one of `null_values` and, for columns
        typed as "string", `strings_can_be_null` has to be set.
        """
        try:
            is_null_marker = value in null_values
        except TypeError:
            # Values already cast to dict or list are unhashable, so they cannot be looked up in a set
            # and can never be one of the null markers
            return False
        return is_null_marker and (strings_can_be_null or deduped_property_type != "string")

    @staticmethod
    def _pre_propcess_property_types(property_types: Dict[str, Any]) -> Mapping[str, str]:
        """
        Transform the property types to be non-nullable and remove duplicate types if any.
        Sample input:
        {
        "col1": ["string", "null"],
        "col2": ["string", "string", "null"],
        "col3": "integer"
        }

        Sample output:
        {
        "col1": "string",
        "col2": "string",
        "col3": "integer",
        }

        :raises ValueError: when a list of types does not reduce to exactly one non-null type
        """
        output = {}
        for prop, prop_type in property_types.items():
            if isinstance(prop_type, list):
                prop_type_distinct = set(prop_type)
                # discard instead of remove: a list that is already non-nullable (no "null" entry) is valid input
                prop_type_distinct.discard("null")
                if len(prop_type_distinct) != 1:
                    raise ValueError(f"Could not get non nullable type from {prop_type}")
                output[prop] = next(iter(prop_type_distinct))
            else:
                output[prop] = prop_type
        return output

    @staticmethod
    def _cast_types(
        row: Dict[str, str],
        deduped_property_types: Mapping[str, str],
        config_format: CsvFormat,
        logger: logging.Logger,
    ) -> Dict[str, Any]:
        """
        Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.

        Array and object types are only handled if they can be deserialized as JSON.

        If any errors are encountered, the value will be emitted as a string.
        All casting failures of the row are reported in a single warning log.
        """
        warnings = []
        result = {}

        for key, value in row.items():
            prop_type = deduped_property_types.get(key)
            cast_value: Any = value

            if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None:
                _, python_type = TYPE_PYTHON_MAPPING[prop_type]

                if python_type is None:
                    if value == "":
                        cast_value = None
                    else:
                        warnings.append(_format_warning(key, value, prop_type))

                elif python_type is bool:
                    try:
                        cast_value = _value_to_bool(
                            value, config_format.true_values, config_format.false_values
                        )
                    except ValueError:
                        warnings.append(_format_warning(key, value, prop_type))

                elif python_type is dict:
                    try:
                        # we don't re-use _value_to_object here because we type the column as object as long as there is only one object
                        cast_value = orjson.loads(value)
                    except orjson.JSONDecodeError:
                        warnings.append(_format_warning(key, value, prop_type))

                elif python_type is list:
                    try:
                        cast_value = _value_to_list(value)
                    except (ValueError, json.JSONDecodeError):
                        warnings.append(_format_warning(key, value, prop_type))

                elif python_type:
                    try:
                        cast_value = _value_to_python_type(value, python_type)
                    except ValueError:
                        warnings.append(_format_warning(key, value, prop_type))

            result[key] = cast_value

        if warnings:
            logger.warning(
                f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {','.join(warnings)}",
            )
        return result
400
+
401
+
402
+ class _TypeInferrer(ABC):
403
+ @abstractmethod
404
+ def add_value(self, value: Any) -> None:
405
+ pass
406
+
407
+ @abstractmethod
408
+ def infer(self) -> str:
409
+ pass
410
+
411
+
412
class _DisabledTypeInferrer(_TypeInferrer):
    """Type inferrer that ignores every value and always reports "string"."""

    def add_value(self, value: Any) -> None:
        """Ignore the value: nothing is recorded."""
        return None

    def infer(self) -> str:
        """Return "string" regardless of the values seen."""
        inferred_type = "string"
        return inferred_type
418
+
419
+
420
class _JsonTypeInferrer(_TypeInferrer):
    """
    Type inferrer that collects the distinct values of a column and picks the most specific type
    (boolean, then integer, then number, falling back to string) shared by all non-null values.
    """

    _NULL_TYPE = "null"
    _BOOLEAN_TYPE = "boolean"
    _INTEGER_TYPE = "integer"
    _NUMBER_TYPE = "number"
    _STRING_TYPE = "string"

    def __init__(
        self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str]
    ) -> None:
        """
        :param boolean_trues: values interpreted as boolean true
        :param boolean_falses: values interpreted as boolean false
        :param null_values: values interpreted as null
        """
        self._values: Set[str] = set()
        self._null_values = null_values
        self._boolean_falses = boolean_falses
        self._boolean_trues = boolean_trues

    def add_value(self, value: Any) -> None:
        """Add the value to the set of distinct values seen so far."""
        self._values.add(value)

    def infer(self) -> str:
        """
        Return the type of the column based on the values added so far.

        Values identified as null do not take part in the decision.
        """
        non_null_type_sets = []
        for seen_value in self._values:
            candidate_types = self._infer_type(seen_value)
            if self._NULL_TYPE not in candidate_types:
                non_null_type_sets.append(candidate_types)

        if not non_null_type_sets:
            # this is highly unusual but we will consider the column as a string
            return self._STRING_TYPE

        # Only the types valid for every single value can describe the column
        shared_types = set.intersection(*non_null_type_sets)
        for preferred in (self._BOOLEAN_TYPE, self._INTEGER_TYPE, self._NUMBER_TYPE):
            if preferred in shared_types:
                return preferred
        return self._STRING_TYPE

    def _infer_type(self, value: str) -> Set[str]:
        """Return every type the value can be interpreted as; "string" is always included."""
        detected = set()

        if value in self._null_values:
            detected.add(self._NULL_TYPE)
        if self._is_boolean(value):
            detected.add(self._BOOLEAN_TYPE)
        if self._is_integer(value):
            # an integer is also a valid number
            detected.update((self._INTEGER_TYPE, self._NUMBER_TYPE))
        elif self._is_number(value):
            detected.add(self._NUMBER_TYPE)

        detected.add(self._STRING_TYPE)
        return detected

    def _is_boolean(self, value: str) -> bool:
        """Tell whether the value is one of the configured true or false values."""
        try:
            _value_to_bool(value, self._boolean_trues, self._boolean_falses)
        except ValueError:
            return False
        else:
            return True

    @staticmethod
    def _is_integer(value: str) -> bool:
        """Tell whether the value can be converted with `int`."""
        try:
            _value_to_python_type(value, int)
        except ValueError:
            return False
        else:
            return True

    @staticmethod
    def _is_number(value: str) -> bool:
        """Tell whether the value can be converted with `float`."""
        try:
            _value_to_python_type(value, float)
        except ValueError:
            return False
        else:
            return True
494
+
495
+
496
+ def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool:
497
+ if value in true_values:
498
+ return True
499
+ if value in false_values:
500
+ return False
501
+ raise ValueError(f"Value {value} is not a valid boolean value")
502
+
503
+
504
+ def _value_to_list(value: str) -> List[Any]:
505
+ parsed_value = json.loads(value)
506
+ if isinstance(parsed_value, list):
507
+ return parsed_value
508
+ raise ValueError(f"Value {parsed_value} is not a valid list value")
509
+
510
+
511
+ def _value_to_python_type(value: str, python_type: type) -> Any:
512
+ return python_type(value)
513
+
514
+
515
+ def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str:
516
+ return f"{key}: value={value},expected_type={expected_type}"
517
+
518
+
519
+ def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]:
520
+ return row
521
+
522
+
523
def _extract_format(config: FileBasedStreamConfig) -> CsvFormat:
    """
    Return the format of the stream config after checking that it is a CSV format.

    :raises ValueError: when the configured format is not a `CsvFormat`
    """
    candidate = config.format
    if isinstance(candidate, CsvFormat):
        return candidate
    raise ValueError(f"Invalid format config: {candidate}")