airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,313 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from datetime import datetime, timedelta
7
+ from threading import RLock
8
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, MutableMapping, Optional, Tuple
9
+
10
+ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type
11
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
12
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
13
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
14
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import (
15
+ AbstractConcurrentFileBasedCursor,
16
+ )
17
+ from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor
18
+ from airbyte_cdk.sources.file_based.types import StreamState
19
+ from airbyte_cdk.sources.message.repository import MessageRepository
20
+ from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
21
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
22
+ from airbyte_cdk.sources.types import Record
23
+
24
+ if TYPE_CHECKING:
25
+ from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
26
+
27
+ _NULL_FILE = ""
28
+
29
+
30
+ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
31
+ CURSOR_FIELD = "_ab_source_file_last_modified"
32
+ DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = (
33
+ DefaultFileBasedCursor.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
34
+ )
35
+ DEFAULT_MAX_HISTORY_SIZE = 10_000
36
+ DATE_TIME_FORMAT = DefaultFileBasedCursor.DATE_TIME_FORMAT
37
+ zero_value = datetime.min
38
+ zero_cursor_value = f"0001-01-01T00:00:00.000000Z_{_NULL_FILE}"
39
+
40
def __init__(
    self,
    stream_config: FileBasedStreamConfig,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    message_repository: MessageRepository,
    connector_state_manager: ConnectorStateManager,
    cursor_field: CursorField,
) -> None:
    """
    Set up the cursor for a single stream from its configuration and saved state.

    :param stream_config: stream configuration; only ``days_to_sync_if_history_is_full`` is read here
    :param stream_name: name of the stream, stored on the instance
    :param stream_namespace: optional namespace of the stream, stored on the instance
    :param stream_state: saved state of the stream; its "history" entry (if any) seeds the file history
    :param message_repository: repository stored on the instance for emitting messages
    :param connector_state_manager: state manager stored on the instance
    :param cursor_field: cursor field whose ``cursor_field_key`` locates the cursor value in the state
    """
    super().__init__()

    # Locks are created first; the derived values computed at the end acquire the state lock.
    self._pending_files_lock = RLock()
    self._state_lock = RLock()

    # Plain references to the constructor arguments.
    self._cursor_field = cursor_field
    self._connector_state_manager = connector_state_manager
    self._message_repository = message_repository
    self._state = stream_state
    self._stream_namespace = stream_namespace
    self._stream_name = stream_name

    # A falsy configured value (None or 0) falls back to the class default.
    days_to_sync = (
        stream_config.days_to_sync_if_history_is_full
        or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
    )
    self._time_window_if_history_is_full = timedelta(days=days_to_sync)

    # Pending files stay None until set_pending_partitions() is called.
    self._pending_files: Optional[Dict[str, RemoteFile]] = None

    if stream_state:
        self._file_to_datetime_history = stream_state.get("history", {})
    else:
        self._file_to_datetime_history = {}

    # These two depend on the attributes assigned above, so they must come last and in this order.
    self._prev_cursor_value = self._compute_prev_sync_cursor(stream_state)
    self._sync_start = self._compute_start_time()
67
+
68
@property
def state(self) -> MutableMapping[str, Any]:
    """Return the state mapping held by this cursor (the object itself, not a copy)."""
    current_state = self._state
    return current_state
71
+
72
def observe(self, record: Record) -> None:
    """
    Do nothing: this cursor takes no action when a record is observed.

    :param record: the observed record; ignored
    """
    return None
74
+
75
def close_partition(self, partition: Partition) -> None:
    """
    Verify that the pending files were registered before a partition is closed.

    :param partition: the partition being closed; not inspected here
    :raises RuntimeError: if the pending files are still unset (None)
    """
    with self._pending_files_lock:
        pending_files_unset = self._pending_files is None
        if pending_files_unset:
            raise RuntimeError(
                "Expected pending partitions to be set but it was not. This is unexpected. Please contact Support."
            )
81
+
82
def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
    """
    Replace the pending files with the files referenced by the given partitions.

    The pending-files mapping (file URI -> file) is reset to an empty dict and then
    filled from the "files" entry of each partition's slice. Partitions whose slice
    is None are skipped.

    :param partitions: partitions whose slices list the files that are about to be synced
    :raises RuntimeError: if the same file URI appears more than once across the partitions
    """
    with self._pending_files_lock:
        self._pending_files = {}
        for partition in partitions:
            _slice = partition.to_slice()
            if _slice is None:
                continue
            for file in _slice["files"]:
                if file.uri in self._pending_files:
                    # Name the offending file, not the whole slice, so the message is actionable.
                    raise RuntimeError(
                        f"Already found file {file.uri} in pending files. This is unexpected. Please contact Support."
                    )
                self._pending_files[file.uri] = file
95
+
96
def _compute_prev_sync_cursor(self, value: Optional[StreamState]) -> Tuple[datetime, str]:
    """
    Work out the cursor of the previous sync as a (datetime, file URI) pair.

    :param value: previously saved stream state, if any
    :return: ``(zero_value, "")`` when there is no state; otherwise the smaller of the
        cursor string stored in the state and the cursor key of the earliest file in
        history, split into its datetime and URI parts
    """
    if not value:
        return self.zero_value, ""

    stored_cursor = value.get(self._cursor_field.cursor_field_key)
    if not stored_cursor:
        stored_cursor = self.zero_cursor_value

    # A stored cursor greater than the earliest file in history suggests that all files
    # have likely been synced already. The earliest file is still used as the cursor when
    # deciding which files to sync, in case new files were uploaded in the meantime.
    # That should be very rare: it would mean a file with an earlier last_modified time
    # was uploaded after a file with a later one. Since last_modified represents the time
    # the upload started, previous files can usually be expected to be uploaded already;
    # in that case they are in history and re-uploading them is skipped.
    earliest_file = self._compute_earliest_file_in_history()
    earliest_cursor = self._get_cursor_key_from_file(earliest_file)

    # Cursor strings are compared as plain strings.
    chosen_cursor = min(stored_cursor, earliest_cursor)

    # Split on the first underscore only: the URI part may itself contain underscores.
    timestamp_part, uri_part = chosen_cursor.split("_", 1)
    parsed_timestamp = datetime.strptime(timestamp_part, self.DATE_TIME_FORMAT)
    return parsed_timestamp, uri_part
114
+
115
+ def _get_cursor_key_from_file(self, file: Optional[RemoteFile]) -> str:
116
+ if file:
117
+ return f"{datetime.strftime(file.last_modified, self.DATE_TIME_FORMAT)}_{file.uri}"
118
+ return self.zero_cursor_value
119
+
120
+ def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]:
121
+ with self._state_lock:
122
+ if self._file_to_datetime_history:
123
+ filename, last_modified = min(
124
+ self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])
125
+ )
126
+ return RemoteFile(
127
+ uri=filename,
128
+ last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT),
129
+ )
130
+ else:
131
+ return None
132
+
133
def add_file(self, file: RemoteFile) -> None:
    """
    Add a file to the cursor. This method is called when a file is processed by the stream.

    The file is removed from the pending files (a WARN log message is emitted if it was not
    pending), recorded in the history, and a state message is emitted. When the history grows
    beyond ``DEFAULT_MAX_HISTORY_SIZE``, its earliest entry is dropped.

    :param file: The file to add
    :raises RuntimeError: If the pending files have not been set
    :raises Exception: If the history is over capacity but no earliest entry can be found
    """
    if self._pending_files is None:
        raise RuntimeError(
            "Expected pending partitions to be set but it was not. This is unexpected. Please contact Support."
        )
    with self._pending_files_lock, self._state_lock:
        if file.uri in self._pending_files:
            self._pending_files.pop(file.uri)
        else:
            warning = AirbyteMessage(
                type=Type.LOG,
                log=AirbyteLogMessage(
                    level=Level.WARN,
                    message=f"The file {file.uri} was not found in the list of pending files. This is unexpected. Please contact Support",
                ),
            )
            self._message_repository.emit_message(warning)

        history = self._file_to_datetime_history
        history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
        if len(history) > self.DEFAULT_MAX_HISTORY_SIZE:
            # Evict the earliest file, ordered by last modified date and then uri.
            evicted = self._compute_earliest_file_in_history()
            if not evicted:
                raise Exception(
                    "The history is full but there is no files in the history. This should never happen and might be indicative of a bug in the CDK."
                )
            del history[evicted.uri]
        self.emit_state_message()
169
+
170
def emit_state_message(self) -> None:
    """
    Push the current cursor state to the connector state manager and emit the state message
    it creates for this stream, all while holding the state lock.
    """
    with self._state_lock:
        snapshot = self.get_state()
        manager = self._connector_state_manager
        stream_id = (self._stream_name, self._stream_namespace)
        manager.update_state_for_stream(*stream_id, snapshot)
        self._message_repository.emit_message(manager.create_state_message(*stream_id))
182
+
183
+ def _get_new_cursor_value(self) -> str:
184
+ with self._pending_files_lock:
185
+ with self._state_lock:
186
+ if self._pending_files:
187
+ # If there are partitions that haven't been synced, we don't know whether the files that have been synced
188
+ # represent a contiguous region.
189
+ # To avoid missing files, we only increment the cursor up to the oldest pending file, because we know
190
+ # that all older files have been synced.
191
+ return self._get_cursor_key_from_file(self._compute_earliest_pending_file())
192
+ elif self._file_to_datetime_history:
193
+ # If all partitions have been synced, we know that the sync is up-to-date and so can advance
194
+ # the cursor to the newest file in history.
195
+ return self._get_cursor_key_from_file(self._compute_latest_file_in_history())
196
+ else:
197
+ return f"{self.zero_value.strftime(self.DATE_TIME_FORMAT)}_"
198
+
199
+ def _compute_earliest_pending_file(self) -> Optional[RemoteFile]:
200
+ if self._pending_files:
201
+ return min(self._pending_files.values(), key=lambda x: x.last_modified)
202
+ else:
203
+ return None
204
+
205
+ def _compute_latest_file_in_history(self) -> Optional[RemoteFile]:
206
+ with self._state_lock:
207
+ if self._file_to_datetime_history:
208
+ filename, last_modified = max(
209
+ self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])
210
+ )
211
+ return RemoteFile(
212
+ uri=filename,
213
+ last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT),
214
+ )
215
+ else:
216
+ return None
217
+
218
def get_files_to_sync(
    self, all_files: Iterable[RemoteFile], logger: logging.Logger
) -> Iterable[RemoteFile]:
    """
    Given the list of files in the source, lazily yield the files that should be synced.

    A warning is logged first when the history is full.

    :param all_files: All files in the source
    :param logger: Logger used for the warning and passed on to the per-file check
    :return: The files that should be synced
    """
    with self._state_lock:
        if self._is_history_full():
            logger.warning(
                f"The state history is full. "
                f"This sync and future syncs won't be able to use the history to filter out duplicate files. "
                f"It will instead use the time window of {self._time_window_if_history_is_full} to filter out files."
            )
        yield from filter(lambda candidate: self._should_sync_file(candidate, logger), all_files)
237
+
238
+ def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
239
+ with self._state_lock:
240
+ if file.uri in self._file_to_datetime_history:
241
+ # If the file's uri is in the history, we should sync the file if it has been modified since it was synced
242
+ updated_at_from_history = datetime.strptime(
243
+ self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT
244
+ )
245
+ if file.last_modified < updated_at_from_history:
246
+ self._message_repository.emit_message(
247
+ AirbyteMessage(
248
+ type=Type.LOG,
249
+ log=AirbyteLogMessage(
250
+ level=Level.WARN,
251
+ message=f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file.",
252
+ ),
253
+ )
254
+ )
255
+ return False
256
+ else:
257
+ return file.last_modified > updated_at_from_history
258
+
259
+ prev_cursor_timestamp, prev_cursor_uri = self._prev_cursor_value
260
+ if self._is_history_full():
261
+ if file.last_modified > prev_cursor_timestamp:
262
+ # If the history is partial and the file's datetime is strictly greater than the cursor, we should sync it
263
+ return True
264
+ elif file.last_modified == prev_cursor_timestamp:
265
+ # If the history is partial and the file's datetime is equal to the earliest file in the history,
266
+ # we should sync it if its uri is greater than or equal to the cursor value.
267
+ return file.uri > prev_cursor_uri
268
+ else:
269
+ return file.last_modified >= self._sync_start
270
+ else:
271
+ # The file is not in the history and the history is complete. We know we need to sync the file
272
+ return True
273
+
274
+ def _is_history_full(self) -> bool:
275
+ """
276
+ Returns true if the state's history is full, meaning new entries will start to replace old entries.
277
+ """
278
+ with self._state_lock:
279
+ if self._file_to_datetime_history is None:
280
+ raise RuntimeError(
281
+ "The history object has not been set. This is unexpected. Please contact Support."
282
+ )
283
+ return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE
284
+
285
+ def _compute_start_time(self) -> datetime:
286
+ if not self._file_to_datetime_history:
287
+ return datetime.min
288
+ else:
289
+ earliest = min(self._file_to_datetime_history.values())
290
+ earliest_dt = datetime.strptime(earliest, self.DATE_TIME_FORMAT)
291
+ if self._is_history_full():
292
+ time_window = datetime.now() - self._time_window_if_history_is_full
293
+ earliest_dt = min(earliest_dt, time_window)
294
+ return earliest_dt
295
+
296
def get_start_time(self) -> datetime:
    """
    Returns the start time of the current sync, as stored in ``_sync_start``.
    """
    sync_start = self._sync_start
    return sync_start
298
+
299
def get_state(self) -> MutableMapping[str, Any]:
    """
    Get the state of the cursor.

    :return: A mapping holding the file history under ``"history"`` and the new cursor value
        under the cursor field key
    """
    with self._state_lock:
        state: MutableMapping[str, Any] = {"history": self._file_to_datetime_history}
        cursor_key = self._cursor_field.cursor_field_key
        state[cursor_key] = self._get_new_cursor_value()
        return state
308
+
309
def set_initial_state(self, value: StreamState) -> None:
    """
    Accept the initial stream state. This implementation ignores it.

    :param value: The stream state
    """
    return None
311
+
312
def ensure_at_least_one_state_emitted(self) -> None:
    """
    Emit a state message by delegating to ``emit_state_message``.
    """
    emit = self.emit_state_message
    emit()
@@ -0,0 +1,83 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping, Optional
8
+
9
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
10
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
11
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
12
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import (
13
+ AbstractConcurrentFileBasedCursor,
14
+ )
15
+ from airbyte_cdk.sources.file_based.types import StreamState
16
+ from airbyte_cdk.sources.message import MessageRepository
17
+ from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
18
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
19
+ from airbyte_cdk.sources.types import Record
20
+
21
+ if TYPE_CHECKING:
22
+ from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
23
+
24
+
25
class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor):
    """
    Cursor for concurrent file-based streams whose only job is to make sure a state message
    is emitted at least once. It tracks no files and filters nothing.
    """

    def __init__(
        self,
        stream_config: FileBasedStreamConfig,
        message_repository: MessageRepository,
        stream_namespace: Optional[str],
        **kwargs: Any,
    ):
        """
        :param stream_config: Configuration of the stream; only its ``name`` is used
        :param message_repository: Repository the final state message is emitted to
        :param stream_namespace: Namespace of the stream, if any
        :param kwargs: Ignored
        """
        self._stream_name = stream_config.name
        self._message_repository = message_repository
        self._stream_namespace = stream_namespace
        # A state manager normally lives at the source level. A private one is enough here because
        # it is only used to build the sentinel state message, not to manage the source's state.
        # This is temporary until every stream uses a FileBasedConcurrentCursor with incremental state.
        self._connector_state_manager = ConnectorStateManager()

    @property
    def state(self) -> MutableMapping[str, Any]:
        """The sentinel state marking the stream as having no cursor."""
        return {NO_CURSOR_STATE_KEY: True}

    def observe(self, record: Record) -> None:
        """Ignore the record."""
        return None

    def close_partition(self, partition: Partition) -> None:
        """Do nothing when a partition is closed."""
        return None

    def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
        """Ignore the partitions; nothing is tracked."""
        return None

    def add_file(self, file: RemoteFile) -> None:
        """Ignore the file; no history is kept."""
        return None

    def get_files_to_sync(
        self, all_files: Iterable[RemoteFile], logger: logging.Logger
    ) -> Iterable[RemoteFile]:
        """Return ``all_files`` unchanged: every file is synced."""
        return all_files

    def get_state(self) -> MutableMapping[str, Any]:
        """Return an empty state mapping."""
        return {}

    def set_initial_state(self, value: StreamState) -> None:
        """Ignore the initial state."""
        return None

    def get_start_time(self) -> datetime:
        """Return ``datetime.min``."""
        return datetime.min

    def emit_state_message(self) -> None:
        """Do nothing; state is only emitted by ``ensure_at_least_one_state_emitted``."""
        return None

    def ensure_at_least_one_state_emitted(self) -> None:
        """Record the sentinel state for this stream and emit the resulting state message."""
        manager = self._connector_state_manager
        stream_id = (self._stream_name, self._stream_namespace)
        manager.update_state_for_stream(*stream_id, self.state)
        self._message_repository.emit_message(manager.create_state_message(*stream_id))
@@ -0,0 +1,4 @@
1
+ from .abstract_file_based_cursor import AbstractFileBasedCursor
2
+ from .default_file_based_cursor import DefaultFileBasedCursor
3
+
4
+ __all__ = ["AbstractFileBasedCursor", "DefaultFileBasedCursor"]
@@ -0,0 +1,66 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+ from datetime import datetime
8
+ from typing import Any, Iterable, MutableMapping
9
+
10
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
11
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
12
+ from airbyte_cdk.sources.file_based.types import StreamState
13
+
14
+
15
class AbstractFileBasedCursor(ABC):
    """
    Interface that every cursor used by a file-based stream has to implement.
    """

    @abstractmethod
    def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
        """
        Shared constructor signature for all cursors.

        :param stream_config: The configuration of the stream the cursor belongs to
        :param kwargs: Additional, implementation-specific arguments
        """

    @abstractmethod
    def add_file(self, file: RemoteFile) -> None:
        """
        Register a file with the cursor. Called when a file is processed by the stream.

        :param file: The file to add
        """

    @abstractmethod
    def set_initial_state(self, value: StreamState) -> None:
        """
        Provide the cursor with its initial state. This cannot happen at construction time
        because the stream doesn't know its state yet.

        :param value: The stream state
        """

    @abstractmethod
    def get_state(self) -> MutableMapping[str, Any]:
        """
        Return the state of the cursor.
        """

    @abstractmethod
    def get_start_time(self) -> datetime:
        """
        Return the start time of the current sync.
        """

    @abstractmethod
    def get_files_to_sync(
        self, all_files: Iterable[RemoteFile], logger: logging.Logger
    ) -> Iterable[RemoteFile]:
        """
        Select, among the files in the source, those that should be synced.

        :param all_files: All files in the source
        :param logger: Logger available to the implementation
        :return: The files that should be synced
        """
@@ -0,0 +1,149 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from datetime import datetime, timedelta
7
+ from typing import Any, Iterable, MutableMapping, Optional
8
+
9
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
+ from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import (
12
+ AbstractFileBasedCursor,
13
+ )
14
+ from airbyte_cdk.sources.file_based.types import StreamState
15
+
16
+
17
class DefaultFileBasedCursor(AbstractFileBasedCursor):
    """
    Cursor that remembers synced files in a bounded history mapping each file uri to its
    last-modified timestamp, and uses that history to decide which files to sync.
    """

    DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
    DEFAULT_MAX_HISTORY_SIZE = 10_000
    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
    CURSOR_FIELD = "_ab_source_file_last_modified"

    def __init__(self, stream_config: FileBasedStreamConfig, **_: Any):
        """
        :param stream_config: Stream configuration; ``days_to_sync_if_history_is_full`` sets the
            fallback time window, with the class default used when it is falsy
        :raises ValueError: If the resulting time window is not positive
        """
        super().__init__(stream_config)  # type: ignore [safe-super]
        self._file_to_datetime_history: MutableMapping[str, str] = {}
        window_days = (
            stream_config.days_to_sync_if_history_is_full
            or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
        )
        self._time_window_if_history_is_full = timedelta(days=window_days)

        if self._time_window_if_history_is_full <= timedelta():
            raise ValueError(
                f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}"
            )

        self._start_time = self._compute_start_time()
        self._initial_earliest_file_in_history: Optional[RemoteFile] = None

    def set_initial_state(self, value: StreamState) -> None:
        """
        Load the history from the given state, then recompute the start time and the earliest
        file in that initial history.

        :param value: The stream state; its ``"history"`` entry is used, defaulting to empty
        """
        self._file_to_datetime_history = value.get("history", {})
        self._start_time = self._compute_start_time()
        self._initial_earliest_file_in_history = self._compute_earliest_file_in_history()

    def add_file(self, file: RemoteFile) -> None:
        """
        Record a processed file in the history, dropping the earliest entry when the history
        grows beyond ``DEFAULT_MAX_HISTORY_SIZE``.

        :param file: The file to add
        :raises Exception: If the history is over capacity but no earliest entry can be found
        """
        history = self._file_to_datetime_history
        history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
        if len(history) <= self.DEFAULT_MAX_HISTORY_SIZE:
            return
        # Evict the earliest file, ordered by last modified date and then uri.
        evicted = self._compute_earliest_file_in_history()
        if not evicted:
            raise Exception(
                "The history is full but there is no files in the history. This should never happen and might be indicative of a bug in the CDK."
            )
        del history[evicted.uri]

    def get_state(self) -> StreamState:
        """
        Return the history together with the cursor value stored under ``CURSOR_FIELD``.
        """
        return {"history": self._file_to_datetime_history, self.CURSOR_FIELD: self._get_cursor()}

    def _get_cursor(self) -> Optional[str]:
        """
        Returns the cursor value, or ``None`` when the history is empty.

        Files are synced in order of last-modified with secondary sort on filename, so the cursor
        value joins the last-modified timestamp of the last synced file and the name of that file.
        """
        history = self._file_to_datetime_history
        if not history.items():
            return None
        latest_name, latest_timestamp = max(history.items(), key=lambda entry: (entry[1], entry[0]))
        return f"{latest_timestamp}_{latest_name}"

    def _is_history_full(self) -> bool:
        """
        Returns true if the state's history is full, meaning new entries will start to replace old entries.
        """
        return self.DEFAULT_MAX_HISTORY_SIZE <= len(self._file_to_datetime_history)

    def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
        """
        Decide whether a single file has to be synced.

        :param file: The candidate file
        :param logger: Logger used to warn about a file older than its history entry
        :return: ``True`` when the file should be synced
        """
        history = self._file_to_datetime_history
        if file.uri in history:
            # Known file: sync it only if it was modified after the time recorded in history.
            synced_at = datetime.strptime(history[file.uri], self.DATE_TIME_FORMAT)
            if file.last_modified < synced_at:
                logger.warning(
                    f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file."
                )
            return file.last_modified > synced_at

        if not self._is_history_full():
            # The file is not in the history and the history is complete: it must be synced.
            return True

        baseline = self._initial_earliest_file_in_history
        if baseline is None:
            return True
        if file.last_modified > baseline.last_modified:
            # The history is partial and the file is strictly newer than its earliest entry.
            return True
        if file.last_modified == baseline.last_modified:
            # Same timestamp as the earliest entry: sync only if the uri is strictly greater.
            return file.uri > baseline.uri
        # Otherwise, only sync the file if it was modified since the start of the time window.
        return file.last_modified >= self.get_start_time()

    def get_files_to_sync(
        self, all_files: Iterable[RemoteFile], logger: logging.Logger
    ) -> Iterable[RemoteFile]:
        """
        Lazily yield the files of the source that should be synced, warning first when the
        history is full.

        :param all_files: All files in the source
        :param logger: Logger used for warnings
        :return: The files that should be synced
        """
        if self._is_history_full():
            logger.warning(
                f"The state history is full. "
                f"This sync and future syncs won't be able to use the history to filter out duplicate files. "
                f"It will instead use the time window of {self._time_window_if_history_is_full} to filter out files."
            )
        yield from filter(lambda candidate: self._should_sync_file(candidate, logger), all_files)

    def get_start_time(self) -> datetime:
        """
        Returns the start time computed for the current sync.
        """
        return self._start_time

    def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]:
        """
        Find the history entry with the smallest ``(last_modified, uri)`` pair, or ``None``
        when the history is empty.
        """
        history = self._file_to_datetime_history
        if not history:
            return None
        oldest_uri, oldest_timestamp = min(history.items(), key=lambda entry: (entry[1], entry[0]))
        return RemoteFile(
            uri=oldest_uri, last_modified=datetime.strptime(oldest_timestamp, self.DATE_TIME_FORMAT)
        )

    def _compute_start_time(self) -> datetime:
        """
        Compute the sync start time: ``datetime.min`` for an empty history; otherwise the earliest
        timestamp in the history, pulled back to ``now - time window`` when the history is full
        and that is earlier.
        """
        history = self._file_to_datetime_history
        if not history:
            return datetime.min
        oldest = datetime.strptime(min(history.values()), self.DATE_TIME_FORMAT)
        if not self._is_history_full():
            return oldest
        window_start = datetime.now() - self._time_window_if_history_is_full
        return min(oldest, window_start)