airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,159 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from enum import Enum
6
+ from typing import Any, List, Union
7
+
8
+ from airbyte_cdk.models import AirbyteMessage, FailureType
9
+ from airbyte_cdk.utils import AirbyteTracedException
10
+
11
+
12
class FileBasedSourceError(Enum):
    """User-facing error messages shared by the file-based source exceptions.

    Each member's value is the message text itself. Long messages are written
    as adjacent string literals, which Python concatenates into one string.
    """

    # --- Discovering and opening files ---
    EMPTY_STREAM = (
        "No files were identified in the stream. "
        "This may be because there are no files in the specified container, "
        "or because your glob patterns did not match any files. "
        "Please verify that your source contains files last modified after the start_date "
        "and that your glob patterns are not overly strict."
    )
    GLOB_PARSE_ERROR = (
        "Error parsing glob pattern. "
        "Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
    )
    ENCODING_ERROR = "File encoding error. The configured encoding must match file encoding."

    # --- Casting and decoding values ---
    ERROR_CASTING_VALUE = "Could not cast the value to the expected type."
    ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = (
        "Could not cast the value to the expected type because the type is not recognized. "
        "Valid types are null, array, boolean, integer, number, object, and string."
    )
    ERROR_DECODING_VALUE = "Expected a JSON-decodeable value but could not decode record."

    # --- Listing, reading and parsing ---
    ERROR_LISTING_FILES = (
        "Error listing files. "
        "Please check the credentials provided in the config "
        "and verify that they provide permission to list files."
    )
    ERROR_READING_FILE = (
        "Error opening file. "
        "Please check the credentials provided in the config "
        "and verify that they provide permission to read files."
    )
    ERROR_PARSING_RECORD = (
        "Error parsing record. "
        "This could be due to a mismatch between the config's file type and the actual file type, "
        "or because the file or record is not parseable."
    )
    ERROR_PARSING_USER_PROVIDED_SCHEMA = "The provided schema could not be transformed into valid JSON Schema."
    ERROR_VALIDATING_RECORD = (
        "One or more records do not pass the schema validation policy. "
        "Please modify your input schema, or select a more lenient validation policy."
    )
    ERROR_PARSING_RECORD_MISMATCHED_COLUMNS = (
        "A header field has resolved to `None`. "
        "This indicates that the CSV has more rows than the number of header fields. "
        "If you input your schema or headers, please verify that the number of columns "
        "corresponds to the number of columns in your CSV's rows."
    )
    ERROR_PARSING_RECORD_MISMATCHED_ROWS = (
        "A row's value has resolved to `None`. "
        "This indicates that the CSV has more columns in the header field "
        "than the number of columns in the row(s). "
        "If you input your schema or headers, please verify that the number of columns "
        "corresponds to the number of columns in your CSV's rows."
    )
    STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = (
        "Stopping sync in accordance with the configured validation policy. "
        "Records in file did not conform to the schema."
    )

    # --- Schema inference and configuration ---
    NULL_VALUE_IN_SCHEMA = "Error during schema inference: no type was detected for key."
    UNRECOGNIZED_TYPE = "Error during schema inference: unrecognized type."
    SCHEMA_INFERENCE_ERROR = "Error inferring schema from files. Are the files valid?"
    INVALID_SCHEMA_ERROR = (
        "No fields were identified for this schema. "
        "This may happen if the stream is empty. "
        "Please check your configuration to verify that there are files "
        "that match the stream's glob patterns."
    )
    CONFIG_VALIDATION_ERROR = "Error creating stream config object."
    MISSING_SCHEMA = "Expected `json_schema` in the configured catalog but it is missing."
    UNDEFINED_PARSER = "No parser is defined for this file type."
    UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
39
+
40
+
41
class FileBasedErrorsCollector:
    """
    The placeholder for all errors collected.

    Logged error messages are accumulated with `collect` and later flushed with
    `yield_and_raise_collected`. Each collector instance owns its own list.
    """

    errors: List[AirbyteMessage]

    def __init__(self) -> None:
        # A class-level `[]` default would be one list shared by every instance,
        # so errors collected (or cleared) by one collector would affect all others.
        self.errors = []

    def yield_and_raise_collected(self) -> Any:
        """
        Emit every collected message, then raise a single summarizing exception.

        This is a generator: nothing happens until it is iterated. When no errors
        were collected it yields nothing and raises nothing.

        :raises AirbyteTracedException: after all collected messages were yielded.
        """
        if self.errors:
            # emit collected logged messages
            yield from self.errors
            # clean the collector
            self.errors.clear()
            # raising the single exception
            raise AirbyteTracedException(
                internal_message="Please check the logged errors for more information.",
                message="Some errors occured while reading from the source.",
                failure_type=FailureType.config_error,
            )

    def collect(self, logged_error: AirbyteMessage) -> None:
        """Store `logged_error` so it is emitted by the next `yield_and_raise_collected`."""
        self.errors.append(logged_error)
63
+
64
+
65
class BaseFileBasedSourceError(Exception):
    """Common base for the file-based source exceptions defined in this module.

    The exception message is the error text followed by a support hint and, on a
    second line, the keyword arguments rendered as space-separated `key=value` pairs.
    """

    def __init__(self, error: Union[FileBasedSourceError, str], **kwargs):  # type: ignore # noqa
        # Enum members contribute their message text; plain strings are used as given.
        headline = error.value if isinstance(error, FileBasedSourceError) else error
        context = " ".join(f"{key}={value}" for key, value in kwargs.items())
        super().__init__(f"{headline} Contact Support if you need assistance.\n{context}")
72
+
73
+
74
class ConfigValidationError(BaseFileBasedSourceError):
    """Distinct exception type for config validation failures; adds no behavior to the base class."""
76
+
77
+
78
class InvalidSchemaError(BaseFileBasedSourceError):
    """Distinct exception type for invalid-schema failures; adds no behavior to the base class."""
80
+
81
+
82
class MissingSchemaError(BaseFileBasedSourceError):
    """Distinct exception type for missing-schema failures; adds no behavior to the base class."""
84
+
85
+
86
class NoFilesMatchingError(BaseFileBasedSourceError):
    """Distinct exception type for the no-matching-files case; adds no behavior to the base class."""
88
+
89
+
90
class RecordParseError(BaseFileBasedSourceError):
    """Distinct exception type for record parsing failures; adds no behavior to the base class."""
92
+
93
+
94
class SchemaInferenceError(BaseFileBasedSourceError):
    """Distinct exception type for schema inference failures; adds no behavior to the base class."""
96
+
97
+
98
class CheckAvailabilityError(BaseFileBasedSourceError):
    """Distinct exception type for availability-check failures; adds no behavior to the base class."""
100
+
101
+
102
class UndefinedParserError(BaseFileBasedSourceError):
    """Distinct exception type for the undefined-parser case; adds no behavior to the base class."""
104
+
105
+
106
class StopSyncPerValidationPolicy(BaseFileBasedSourceError):
    """Distinct exception type for stopping a sync per the validation policy; adds no behavior to the base class."""
108
+
109
+
110
class ErrorListingFiles(BaseFileBasedSourceError):
    """Distinct exception type for file listing failures; adds no behavior to the base class."""
112
+
113
+
114
class DuplicatedFilesError(BaseFileBasedSourceError):
    """
    Error reporting files that share a name within one stream.

    :param duplicated_files_names: list of mappings from a duplicated file name to
        the paths at which that name was found.
    :param kwargs: must contain `stream` (the stream name); all keyword arguments
        are also forwarded to the base class.
    :raises KeyError: if `stream` is not passed as a keyword argument.
    """

    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
        self._duplicated_files_names = duplicated_files_names
        self._stream_name: str = kwargs["stream"]
        super().__init__(self._format_duplicate_files_error_message(), **kwargs)

    def _format_duplicate_files_error_message(self) -> str:
        """Build the message: a fixed explanation followed by one section per duplicated name."""
        duplicated_files_messages = []
        for duplicated_file in self._duplicated_files_names:
            for duplicated_file_name, file_paths in duplicated_file.items():
                # One bullet per path. Joining a single f-string of the whole list would
                # only re-join its characters and print the list repr on one bullet.
                file_duplicated_message = (
                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
                    + "".join(f"\n - {file_path}" for file_path in file_paths)
                )
                duplicated_files_messages.append(file_duplicated_message)

        error_message = (
            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
            + "\n".join(duplicated_files_messages)
        )

        return error_message

    def __repr__(self) -> str:
        """Return a string representation of the exception."""
        class_name = self.__class__.__name__
        # Only attributes without a leading underscore are shown; the two attributes
        # set in __init__ are underscore-prefixed and are therefore omitted.
        properties_str = ", ".join(
            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
        )
        return f"{class_name}({properties_str})"
146
+
147
+
148
class CustomFileBasedException(AirbyteTracedException):
    """
    Exception type reserved for file-based connectors.

    It is meant to bypass the file-based CDK's default error handling so that
    connectors can surface their own custom error messages.
    """
156
+
157
+
158
class FileSizeLimitError(CustomFileBasedException):
    """Distinct exception type for file size limit violations; adds no behavior to its parent class."""
@@ -0,0 +1,466 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ import traceback
7
+ from abc import ABC
8
+ from collections import Counter
9
+ from typing import Any, Iterator, List, Mapping, Optional, Tuple, Type, Union
10
+
11
+ from pydantic.v1.error_wrappers import ValidationError
12
+
13
+ from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
14
+ from airbyte_cdk.models import (
15
+ AirbyteMessage,
16
+ AirbyteStateMessage,
17
+ AirbyteStream,
18
+ ConfiguredAirbyteCatalog,
19
+ ConnectorSpecification,
20
+ FailureType,
21
+ Level,
22
+ SyncMode,
23
+ )
24
+ from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
25
+ from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
26
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
27
+ from airbyte_cdk.sources.file_based.availability_strategy import (
28
+ AbstractFileBasedAvailabilityStrategy,
29
+ DefaultFileBasedAvailabilityStrategy,
30
+ )
31
+ from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
32
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
33
+ FileBasedStreamConfig,
34
+ ValidationPolicy,
35
+ )
36
+ from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
37
+ include_identities_stream,
38
+ preserve_directory_structure,
39
+ use_file_transfer,
40
+ use_permissions_transfer,
41
+ )
42
+ from airbyte_cdk.sources.file_based.discovery_policy import (
43
+ AbstractDiscoveryPolicy,
44
+ DefaultDiscoveryPolicy,
45
+ )
46
+ from airbyte_cdk.sources.file_based.exceptions import (
47
+ ConfigValidationError,
48
+ FileBasedErrorsCollector,
49
+ FileBasedSourceError,
50
+ )
51
+ from airbyte_cdk.sources.file_based.file_based_stream_permissions_reader import (
52
+ AbstractFileBasedStreamPermissionsReader,
53
+ )
54
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
55
+ from airbyte_cdk.sources.file_based.file_types import default_parsers
56
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
57
+ from airbyte_cdk.sources.file_based.schema_validation_policies import (
58
+ DEFAULT_SCHEMA_VALIDATION_POLICIES,
59
+ AbstractSchemaValidationPolicy,
60
+ )
61
+ from airbyte_cdk.sources.file_based.stream import (
62
+ AbstractFileBasedStream,
63
+ DefaultFileBasedStream,
64
+ FileIdentitiesStream,
65
+ PermissionsFileBasedStream,
66
+ )
67
+ from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
68
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
69
+ AbstractConcurrentFileBasedCursor,
70
+ FileBasedConcurrentCursor,
71
+ FileBasedFinalStateCursor,
72
+ )
73
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
74
+ from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
75
+ from airbyte_cdk.sources.streams import Stream
76
+ from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
77
+ from airbyte_cdk.utils.analytics_message import create_analytics_message
78
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
79
+
80
# Not read anywhere inside this module; kept as a public module-level constant.
DEFAULT_CONCURRENCY: int = 100

# First positional argument given to ConcurrentSource.create() by FileBasedSource.__init__.
MAX_CONCURRENCY: int = 100

# Half of MAX_CONCURRENCY; second positional argument given to ConcurrentSource.create().
INITIAL_N_PARTITIONS: int = MAX_CONCURRENCY // 2

# Not read anywhere inside this module; kept as a public module-level constant.
IDENTITIES_STREAM: str = "identities"
84
+
85
+
86
+ class FileBasedSource(ConcurrentSourceAdapter, ABC):
87
+ # We make each source override the concurrency level to give control over when they are upgraded.
88
+ _concurrency_level = None
89
+
90
def __init__(
    self,
    stream_reader: AbstractFileBasedStreamReader,
    spec_class: Type[AbstractFileBasedSpec],
    catalog: Optional[ConfiguredAirbyteCatalog],
    config: Optional[Mapping[str, Any]],
    state: Optional[List[AirbyteStateMessage]],
    availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
    discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
    parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
    validation_policies: Mapping[
        ValidationPolicy, AbstractSchemaValidationPolicy
    ] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
    cursor_cls: Type[
        Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]
    ] = FileBasedConcurrentCursor,
    stream_permissions_reader: Optional[AbstractFileBasedStreamPermissionsReader] = None,
):
    """
    Store the collaborators of the source and create the underlying ``ConcurrentSource``.

    Args:
        stream_reader: reader stored on the source and handed to the default availability strategy.
        spec_class: class instantiated with the raw config to obtain the parsed config.
        catalog: configured catalog; when provided, its JSON schemas are indexed by stream name.
        config: raw configuration, stored as-is.
        state: incoming state messages, stored as-is.
        availability_strategy: falls back to ``DefaultFileBasedAvailabilityStrategy(stream_reader)`` when falsy.
        discovery_policy: stored and passed to the streams built by this source.
        parsers: stored and passed to the streams built by this source.
        validation_policies: mapping used to resolve each stream's validation policy.
        cursor_cls: cursor class instantiated for each stream.
        stream_permissions_reader: optional reader required by the permissions and identities streams.
    """
    self.stream_reader = stream_reader
    self.stream_permissions_reader = stream_permissions_reader
    self.spec_class = spec_class
    self.config = config
    self.catalog = catalog
    self.state = state

    if not availability_strategy:
        availability_strategy = DefaultFileBasedAvailabilityStrategy(stream_reader)
    self.availability_strategy = availability_strategy

    self.discovery_policy = discovery_policy
    self.parsers = parsers
    self.validation_policies = validation_policies

    # Index the catalog's JSON schemas by stream name (empty when no catalog is given).
    schemas_by_stream_name = {}
    if catalog:
        for configured_stream in catalog.streams:
            schemas_by_stream_name[configured_stream.stream.name] = (
                configured_stream.stream.json_schema
            )
    self.stream_schemas = schemas_by_stream_name

    self.cursor_cls = cursor_cls
    self.logger = init_logger(f"airbyte.{self.name}")
    self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()

    # Must be initialised (and `self.logger` set) before `self.message_repository` is read below:
    # the property lazily builds the repository from the logger's level.
    self._message_repository: Optional[MessageRepository] = None
    underlying_source = ConcurrentSource.create(
        MAX_CONCURRENCY,
        INITIAL_N_PARTITIONS,
        self.logger,
        self._slice_logger,
        self.message_repository,
    )
    self._state = None
    super().__init__(underlying_source)
136
+
137
@property
def message_repository(self) -> MessageRepository:
    """Return the source's message repository, creating it on first access.

    The repository is an ``InMemoryMessageRepository`` whose level is derived from the current
    level of ``self.logger`` through ``AirbyteLogFormatter.level_mapping``; it is then cached in
    ``self._message_repository``.
    """
    cached_repository = self._message_repository
    if cached_repository is not None:
        return cached_repository

    log_level = Level(AirbyteLogFormatter.level_mapping[self.logger.level])
    self._message_repository = InMemoryMessageRepository(log_level)
    return self._message_repository
144
+
145
def check_connection(
    self, logger: logging.Logger, config: Mapping[str, Any]
) -> Tuple[bool, Optional[Any]]:
    """
    Check that the source can be accessed using the user-provided configuration.

    For each stream, verify that we can list and read files. Identities streams are checked by
    loading a first identity group; every other stream is checked through its availability
    strategy (availability only when file transfer or permissions transfer is enabled,
    availability and parsability otherwise).

    Args:
        logger: logger handed to the availability strategy.
        config: raw user-provided configuration.

    Returns:
        ``(True, None)`` if the connection check is successful, or ``(False, reason)`` when the
        configuration does not produce any stream.

    Raises:
        AirbyteTracedException: with ``FailureType.config_error`` when the streams cannot be built
            from the configuration, or when at least one stream check reports an error.
        ValueError: when a stream is neither a ``FileIdentitiesStream`` nor an ``AbstractFileBasedStream``.
    """
    try:
        streams = self.streams(config)
    except Exception as config_exception:
        raise AirbyteTracedException(
            internal_message="Please check the logged errors for more information.",
            message=FileBasedSourceError.CONFIG_VALIDATION_ERROR.value,
            exception=AirbyteTracedException(exception=config_exception),
            failure_type=FailureType.config_error,
        ) from config_exception
    if not streams:
        return (
            False,
            f"No streams are available for source {self.name}. This is probably an issue with the connector. Please verify that your "
            f"configuration provides permissions to list and read files from the source. Contact support if you are unable to "
            f"resolve this issue.",
        )

    errors = []
    tracebacks = []
    for stream in streams:
        if isinstance(stream, FileIdentitiesStream):
            # Default to None so that an empty result is reported as a regular check error
            # instead of leaking StopIteration out of this method.
            identity = next(iter(stream.load_identity_groups()), None)
            if not identity:
                errors.append(
                    "Unable to get identities for current configuration, please check your credentials"
                )
            continue
        if not isinstance(stream, AbstractFileBasedStream):
            raise ValueError(f"Stream {stream} is not a file-based stream.")
        try:
            parsed_config = self._get_parsed_config(config)
            # With file or permissions transfer the records are not parsed, so only availability is checked.
            availability_method = (
                stream.availability_strategy.check_availability
                if use_file_transfer(parsed_config) or use_permissions_transfer(parsed_config)
                else stream.availability_strategy.check_availability_and_parsability
            )
            stream_is_available, reason = availability_method(stream, logger, self)
        except AirbyteTracedException as ate:
            errors.append(f"Unable to connect to stream {stream.name} - {ate.message}")
            tracebacks.append(traceback.format_exc())
        except Exception:
            errors.append(f"Unable to connect to stream {stream.name}")
            tracebacks.append(traceback.format_exc())
        else:
            if not stream_is_available and reason:
                errors.append(reason)

    # Every traceback is recorded together with an error, so a single error comes with
    # either zero or one traceback.
    if len(errors) == 1 and len(tracebacks) == 1:
        raise AirbyteTracedException(
            internal_message=tracebacks[0],
            message=f"{errors[0]}",
            failure_type=FailureType.config_error,
        )
    if len(errors) == 1 and len(tracebacks) == 0:
        raise AirbyteTracedException(
            message=f"{errors[0]}",
            failure_type=FailureType.config_error,
        )
    if len(errors) > 1:
        raise AirbyteTracedException(
            internal_message="\n".join(tracebacks),
            message=f"{len(errors)} streams with errors: {', '.join(errors)}",
            failure_type=FailureType.config_error,
        )

    return not errors, (errors or None)
226
+
227
def streams(self, config: Mapping[str, Any]) -> List[Stream]:
    """
    Return a list of this source's streams.

    One stream is built per entry of the parsed configuration's ``streams``; an identities
    stream is appended when ``include_identities_stream`` says so. Streams are wrapped in a
    ``FileBasedStreamFacade`` only when the source has a non-None ``_concurrency_level`` and the
    catalog requests a full-refresh sync, or an incremental sync with a concurrent cursor class.

    Raises:
        ConfigValidationError: when a pydantic ``ValidationError`` occurs while parsing or validating the config.
    """
    # During `check` operations we don't have a catalog so cannot create a state manager.
    # Since the state manager is only required for incremental syncs, this is fine.
    state_manager = ConnectorStateManager(state=self.state) if self.catalog else None

    try:
        parsed_config = self._get_parsed_config(config)
        self.stream_reader.config = parsed_config
        if self.stream_permissions_reader:
            self.stream_permissions_reader.config = parsed_config

        streams: List[Stream] = []
        for stream_config in parsed_config.streams:
            # Like state_manager, `catalog_stream` may be None during `check`
            catalog_stream = self._get_stream_from_catalog(stream_config)
            if state_manager and catalog_stream:
                stream_state = state_manager.get_stream_state(
                    catalog_stream.name, catalog_stream.namespace
                )
            else:
                stream_state = None
            self._validate_input_schema(stream_config)

            sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
            concurrency_enabled = getattr(self, "_concurrency_level", None) is not None

            wrap_in_facade = True
            if concurrency_enabled and sync_mode == SyncMode.full_refresh:
                cursor = FileBasedFinalStateCursor(
                    stream_config=stream_config,
                    stream_namespace=None,
                    message_repository=self.message_repository,
                )
            elif (
                concurrency_enabled
                and sync_mode == SyncMode.incremental
                and issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor)
            ):
                assert (
                    state_manager is not None
                ), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support."

                cursor = self.cursor_cls(
                    stream_config,
                    stream_config.name,
                    None,
                    stream_state,
                    self.message_repository,
                    state_manager,
                    CursorField(DefaultFileBasedStream.ab_last_mod_col),
                )
            else:
                # Non-concurrent path: the bare file-based stream is used directly.
                cursor = self.cursor_cls(stream_config)
                wrap_in_facade = False

            file_based_stream = self._make_file_based_stream(
                stream_config=stream_config,
                cursor=cursor,
                parsed_config=parsed_config,
            )
            if wrap_in_facade:
                streams.append(
                    FileBasedStreamFacade.create_from_stream(
                        stream=file_based_stream,
                        source=self,
                        logger=self.logger,
                        state=stream_state,
                        cursor=cursor,
                    )
                )
            else:
                streams.append(file_based_stream)

        if include_identities_stream(parsed_config):
            streams.append(self._make_identities_stream())
        return streams

    except ValidationError as exc:
        raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
326
+
327
def _make_default_stream(
    self,
    stream_config: FileBasedStreamConfig,
    cursor: Optional[AbstractFileBasedCursor],
    parsed_config: AbstractFileBasedSpec,
) -> AbstractFileBasedStream:
    """Build a ``DefaultFileBasedStream`` for ``stream_config``.

    The stream receives this source's reader, availability strategy, discovery policy, parsers
    and errors collector, the catalog schema registered under the stream's name (if any), the
    resolved validation policy, and the file-transfer / preserve-directory-structure flags
    derived from ``parsed_config``.
    """
    catalog_schema = self.stream_schemas.get(stream_config.name)
    validation_policy = self._validate_and_get_validation_policy(stream_config)
    file_transfer_enabled = use_file_transfer(parsed_config)
    keep_directory_structure = preserve_directory_structure(parsed_config)

    return DefaultFileBasedStream(
        config=stream_config,
        catalog_schema=catalog_schema,
        stream_reader=self.stream_reader,
        availability_strategy=self.availability_strategy,
        discovery_policy=self.discovery_policy,
        parsers=self.parsers,
        validation_policy=validation_policy,
        errors_collector=self.errors_collector,
        cursor=cursor,
        use_file_transfer=file_transfer_enabled,
        preserve_directory_structure=keep_directory_structure,
    )
346
+
347
+ def _ensure_permissions_reader_available(self) -> None:
348
+ """
349
+ Validates that a stream permissions reader is available.
350
+ Raises a ValueError if the reader is not provided.
351
+ """
352
+ if not self.stream_permissions_reader:
353
+ raise ValueError(
354
+ "Stream permissions reader is required for streams that use permissions transfer mode."
355
+ )
356
+
357
def _make_permissions_stream(
    self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
) -> AbstractFileBasedStream:
    """
    Build a ``PermissionsFileBasedStream``, i.e. a stream that reads permissions from files.

    Raises:
        ValueError: when the source was created without a stream permissions reader.
    """
    # Fail early: the stream cannot work without a permissions reader.
    self._ensure_permissions_reader_available()

    catalog_schema = self.stream_schemas.get(stream_config.name)
    validation_policy = self._validate_and_get_validation_policy(stream_config)

    return PermissionsFileBasedStream(
        config=stream_config,
        catalog_schema=catalog_schema,
        stream_reader=self.stream_reader,
        availability_strategy=self.availability_strategy,
        discovery_policy=self.discovery_policy,
        parsers=self.parsers,
        validation_policy=validation_policy,
        errors_collector=self.errors_collector,
        cursor=cursor,
        stream_permissions_reader=self.stream_permissions_reader,  # type: ignore
    )
376
+
377
def _make_file_based_stream(
    self,
    stream_config: FileBasedStreamConfig,
    cursor: Optional[AbstractFileBasedCursor],
    parsed_config: AbstractFileBasedSpec,
) -> AbstractFileBasedStream:
    """
    Create the stream matching the transfer mode selected in ``parsed_config``.

    A permissions stream is returned when permissions transfer is enabled; every other mode
    (including file transfer) gets the default file-based stream.
    """
    if not use_permissions_transfer(parsed_config):
        # we should have a stream for File transfer mode to decouple from DefaultFileBasedStream
        return self._make_default_stream(stream_config, cursor, parsed_config)
    return self._make_permissions_stream(stream_config, cursor)
391
+
392
def _make_identities_stream(
    self,
) -> Stream:
    """Build the ``FileIdentitiesStream`` of this source.

    Raises:
        ValueError: when the source was created without a stream permissions reader.
    """
    self._ensure_permissions_reader_available()

    identities_schema = self.stream_schemas.get(FileIdentitiesStream.IDENTITIES_STREAM_NAME)
    return FileIdentitiesStream(
        catalog_schema=identities_schema,
        stream_permissions_reader=self.stream_permissions_reader,  # type: ignore
        discovery_policy=self.discovery_policy,
        errors_collector=self.errors_collector,
    )
402
+
403
def _get_stream_from_catalog(
    self, stream_config: FileBasedStreamConfig
) -> Optional[AirbyteStream]:
    """Return the catalog's ``AirbyteStream`` whose name equals ``stream_config.name``.

    ``None`` is returned when the source has no catalog or when no catalog entry matches.
    """
    if not self.catalog:
        return None

    wanted_name = stream_config.name
    matching_streams = (
        configured_stream.stream
        for configured_stream in self.catalog.streams or []
        if configured_stream.stream.name == wanted_name
    )
    # First match wins, exactly like an early return from a loop.
    return next(matching_streams, None)
411
+
412
def _get_sync_mode_from_catalog(self, stream_name: str) -> Optional[SyncMode]:
    """Return the sync mode configured in the catalog for ``stream_name``.

    ``None`` is returned silently when the source has no catalog; when a catalog exists but
    holds no entry for the stream, a warning is logged before returning ``None``.
    """
    if not self.catalog:
        return None

    for configured_stream in self.catalog.streams:
        if configured_stream.stream.name == stream_name:
            return configured_stream.sync_mode

    self.logger.warning(f"No sync mode was found for {stream_name}.")
    return None
419
+
420
def read(
    self,
    logger: logging.Logger,
    config: Mapping[str, Any],
    catalog: ConfiguredAirbyteCatalog,
    state: Optional[List[AirbyteStateMessage]] = None,
) -> Iterator[AirbyteMessage]:
    """Yield the messages of the parent ``read``, then the collected errors, then parser analytics.

    After the parent's messages, everything produced by
    ``self.errors_collector.yield_and_raise_collected()`` is yielded (as its name indicates,
    that call may raise, in which case no analytics message is emitted). Finally one analytics
    message is emitted per file type, carrying the number of configured streams using it.
    """
    yield from super().read(logger, config, catalog, state)

    # emit all the errors collected
    yield from self.errors_collector.yield_and_raise_collected()

    # count streams using a certain parser
    configured_streams = self._get_parsed_config(config).streams
    streams_per_filetype = Counter(
        stream_config.format.filetype for stream_config in configured_streams
    )
    for filetype, stream_count in streams_per_filetype.items():
        yield create_analytics_message(f"file-cdk-{filetype}-stream-count", stream_count)
436
+
437
def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
    """
    Describe which fields a user can configure when setting up a file-based source.

    The documentation URL and the connection specification schema both come from
    ``self.spec_class``; positional and keyword arguments are accepted but ignored.
    """
    spec_cls = self.spec_class
    documentation_url = spec_cls.documentation_url()
    connection_specification = spec_cls.schema()
    return ConnectorSpecification(
        documentationUrl=documentation_url,
        connectionSpecification=connection_specification,
    )
446
+
447
def _get_parsed_config(self, config: Mapping[str, Any]) -> AbstractFileBasedSpec:
    """Instantiate ``self.spec_class`` with the entries of ``config`` as keyword arguments."""
    parsed_config = self.spec_class(**config)
    return parsed_config
449
+
450
def _validate_and_get_validation_policy(
    self, stream_config: FileBasedStreamConfig
) -> AbstractSchemaValidationPolicy:
    """Return the schema validation policy registered for ``stream_config.validation_policy``.

    Raises:
        ValidationError: when the configured policy is not a key of ``self.validation_policies``.
    """
    if stream_config.validation_policy not in self.validation_policies:
        # Imported locally so the module-level import block stays untouched.
        from pydantic.v1.error_wrappers import ErrorWrapper

        # This should never happen because we validate the config against the schema's validation_policy enum
        # pydantic v1's ValidationError expects a sequence of ErrorWrapper objects; a bare string
        # cannot be rendered by `errors()` / `str()`, which would hide the message below.
        raise ValidationError(
            [
                ErrorWrapper(
                    ValueError(
                        f"`validation_policy` must be one of {list(self.validation_policies.keys())}"
                    ),
                    loc="validation_policy",
                )
            ],
            model=FileBasedStreamConfig,
        )
    return self.validation_policies[stream_config.validation_policy]
460
+
461
def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None:
    """Reject stream configurations that set both ``schemaless`` and ``input_schema``.

    Raises:
        ValidationError: when both options are truthy on ``stream_config``.
    """
    if stream_config.schemaless and stream_config.input_schema:
        # Imported locally so the module-level import block stays untouched.
        from pydantic.v1.error_wrappers import ErrorWrapper

        # pydantic v1's ValidationError expects a sequence of ErrorWrapper objects; a bare string
        # cannot be rendered by `errors()` / `str()`, which would hide the message below.
        raise ValidationError(
            [
                ErrorWrapper(
                    ValueError("`input_schema` and `schemaless` options cannot both be set"),
                    loc="input_schema",
                )
            ],
            model=FileBasedStreamConfig,
        )