airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,210 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import codecs
6
+ from enum import Enum
7
+ from typing import Any, Dict, List, Optional, Set, Union
8
+
9
+ from pydantic.v1 import BaseModel, Field, root_validator, validator
10
+ from pydantic.v1.error_wrappers import ValidationError
11
+
12
+ from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
13
+
14
+
15
class InferenceType(Enum):
    """Modes available for inferring CSV column types."""

    # Each member's value is its human-readable label.
    NONE = "None"
    PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
18
+
19
+
20
class CsvHeaderDefinitionType(Enum):
    """Ways in which the header (column names) of a CSV stream can be defined."""

    # Each member's value is its human-readable label.
    FROM_CSV = "From CSV"
    AUTOGENERATED = "Autogenerated"
    USER_PROVIDED = "User Provided"
24
+
25
+
26
class CsvHeaderFromCsv(BaseModel):
    """Header option indicating that the column names are read from the CSV file."""

    class Config(OneOfOptionConfig):
        title = "From CSV"
        discriminator = "header_definition_type"

    # Constant tag for this option; `Config.discriminator` names this field.
    header_definition_type: str = Field(CsvHeaderDefinitionType.FROM_CSV.value, const=True)

    def has_header_row(self) -> bool:
        """Return True, i.e. this option reports that a header row is present."""
        return True
38
+
39
+
40
class CsvHeaderAutogenerated(BaseModel):
    """Header option indicating that column names are generated rather than read."""

    class Config(OneOfOptionConfig):
        title = "Autogenerated"
        discriminator = "header_definition_type"

    # Constant tag for this option; `Config.discriminator` names this field.
    header_definition_type: str = Field(CsvHeaderDefinitionType.AUTOGENERATED.value, const=True)

    def has_header_row(self) -> bool:
        """Return False, i.e. this option reports that no header row is present."""
        return False
52
+
53
+
54
class CsvHeaderUserProvided(BaseModel):
    """Header option in which the caller supplies the column names explicitly."""

    class Config(OneOfOptionConfig):
        title = "User Provided"
        discriminator = "header_definition_type"

    # Constant tag for this option; `Config.discriminator` names this field.
    header_definition_type: str = Field(CsvHeaderDefinitionType.USER_PROVIDED.value, const=True)
    column_names: List[str] = Field(
        title="Column Names",
        description="The column names that will be used while emitting the CSV records",
    )

    def has_header_row(self) -> bool:
        """Return False, i.e. this option reports that no header row is present."""
        return False

    @validator("column_names")
    def validate_column_names(cls, v: List[str]) -> List[str]:
        """Reject an empty list of column names; return the list unchanged otherwise."""
        if v:
            return v
        raise ValueError(
            "At least one column name needs to be provided when using user provided headers"
        )
78
+
79
+
80
# String literals used as the defaults for `CsvFormat.true_values` and
# `CsvFormat.false_values` respectively.
DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
82
+
83
+
84
class CsvFormat(BaseModel):
    """Configuration model describing how a CSV file should be parsed.

    Each field carries its own user-facing title/description. The validators
    below enforce single-character delimiter/quote/escape settings and a
    resolvable encoding name.
    """

    class Config(OneOfOptionConfig):
        title = "CSV Format"
        discriminator = "filetype"

    # Constant tag for this format; `Config.discriminator` names this field.
    filetype: str = Field(
        "csv",
        const=True,
    )
    delimiter: str = Field(
        title="Delimiter",
        description="The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
        default=",",
    )
    quote_char: str = Field(
        title="Quote Character",
        default='"',
        description="The character used for quoting CSV values. To disallow quoting, make this field blank.",
    )
    escape_char: Optional[str] = Field(
        title="Escape Character",
        default=None,
        description="The character used for escaping special characters. To disallow escaping, leave this field blank.",
    )
    encoding: Optional[str] = Field(
        default="utf8",
        description='The character encoding of the CSV data. Leave blank to default to <strong>UTF8</strong>. See <a href="https://docs.python.org/3/library/codecs.html#standard-encodings" target="_blank">list of python encodings</a> for allowable options.',
    )
    double_quote: bool = Field(
        title="Double Quote",
        default=True,
        description="Whether two quotes in a quoted CSV value denote a single quote in the data.",
    )
    null_values: Set[str] = Field(
        title="Null Values",
        default=[],
        description="A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.",
    )
    strings_can_be_null: bool = Field(
        title="Strings Can Be Null",
        default=True,
        description="Whether strings can be interpreted as null values. If true, strings that match the null_values set will be interpreted as null. If false, strings that match the null_values set will be interpreted as the string itself.",
    )
    skip_rows_before_header: int = Field(
        title="Skip Rows Before Header",
        default=0,
        description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
    )
    skip_rows_after_header: int = Field(
        title="Skip Rows After Header",
        default=0,
        description="The number of rows to skip after the header row.",
    )
    header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = (
        Field(
            title="CSV Header Definition",
            default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
            description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
        )
    )
    true_values: Set[str] = Field(
        title="True Values",
        default=DEFAULT_TRUE_VALUES,
        description="A set of case-sensitive strings that should be interpreted as true values.",
    )
    false_values: Set[str] = Field(
        title="False Values",
        default=DEFAULT_FALSE_VALUES,
        description="A set of case-sensitive strings that should be interpreted as false values.",
    )
    inference_type: InferenceType = Field(
        title="Inference Type",
        default=InferenceType.NONE,
        description="How to infer the types of the columns. If none, inference default to strings.",
        airbyte_hidden=True,
    )
    ignore_errors_on_fields_mismatch: bool = Field(
        title="Ignore errors on field mismatch",
        default=False,
        description="Whether to ignore errors that occur when the number of fields in the CSV does not match the number of columns in the schema.",
    )

    @validator("delimiter")
    def validate_delimiter(cls, v: str) -> str:
        """Normalize and validate the delimiter.

        The two-character text ``\\t`` (backslash + "t") is converted to a real
        tab. The result must be exactly one character and must not be a carriage
        return or line feed.

        Raises:
            ValueError: if the delimiter is not one character, or is CR/LF.
        """
        if v == r"\t":
            v = "\t"
        if len(v) != 1:
            raise ValueError("delimiter should only be one character")
        if v in {"\r", "\n"}:
            # Use the repr so the control character is visible in the message
            # instead of being emitted as a raw line break.
            raise ValueError(f"delimiter cannot be {v!r}")
        return v

    @validator("quote_char")
    def validate_quote_char(cls, v: str) -> str:
        """Require the quote character to be exactly one character long.

        Raises:
            ValueError: if the value's length is not 1.
        """
        if len(v) != 1:
            raise ValueError("quote_char should only be one character")
        return v

    @validator("escape_char")
    def validate_escape_char(cls, v: Optional[str]) -> Optional[str]:
        """Require the escape character, when given, to be exactly one character.

        ``None`` is accepted and returned unchanged.

        Raises:
            ValueError: if a non-None value's length is not 1.
        """
        if v is not None and len(v) != 1:
            raise ValueError("escape_char should only be one character")
        return v

    @validator("encoding")
    def validate_encoding(cls, v: str) -> str:
        """Check that the encoding name is known to Python's codec registry.

        Raises:
            ValueError: if ``codecs.lookup`` cannot resolve the name.
        """
        try:
            codecs.lookup(v)
        except LookupError as err:
            raise ValueError(f"invalid encoding format: {v}") from err
        return v

    @root_validator
    def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Cross-check the header definition type against user-provided column names.

        NOTE(review): neither ``header_definition_type`` nor
        ``user_provided_column_names`` is a field declared on this model (the
        header settings live under ``header_definition``), so both lookups are
        expected to yield ``None`` here and the checks should not trigger.
        Confirm whether this validator is still needed.

        Raises:
            ValueError: if column names are missing for a 'User Provided'
                definition, or supplied for any other definition.
        """
        definition_type = values.get("header_definition_type")
        column_names = values.get("user_provided_column_names")
        # Accept both the enum member and its string value: the header models
        # store the type as a plain string, which never equals the enum member.
        is_user_provided = definition_type in (
            CsvHeaderDefinitionType.USER_PROVIDED,
            CsvHeaderDefinitionType.USER_PROVIDED.value,
        )
        # Validators signal failure with ValueError; pydantic turns it into a
        # ValidationError itself. Building ValidationError from a bare string is
        # not a supported construction.
        if is_user_provided and not column_names:
            raise ValueError(
                "`user_provided_column_names` should be defined if the definition 'User Provided'."
            )
        if not is_user_provided and column_names:
            raise ValueError(
                "`user_provided_column_names` should not be defined if the definition is not 'User Provided'."
            )
        return values
@@ -0,0 +1,18 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from pydantic.v1 import BaseModel, Field
6
+
7
+ from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
8
+
9
+
10
# Format option whose only field is the constant `filetype` tag "excel".
# Documented with comments rather than a class docstring so the generated
# schema is not given an extra description.
class ExcelFormat(BaseModel):
    class Config(OneOfOptionConfig):
        title = "Excel Format"
        discriminator = "filetype"

    # `const=True` pins the field to its default value.
    filetype: str = Field("excel", const=True)
@@ -0,0 +1,99 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from enum import Enum
6
+ from typing import Any, List, Mapping, Optional, Union
7
+
8
+ from pydantic.v1 import BaseModel, Field, validator
9
+
10
+ from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
11
+ from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
12
+ from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
13
+ from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
14
+ from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
15
+ from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
16
+ from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
17
+ from airbyte_cdk.sources.file_based.schema_helpers import type_mapping_to_jsonschema
18
+
19
# Alias for an optional primary-key value: a single string, a list of strings, or None.
PrimaryKeyType = Optional[Union[str, List[str]]]
20
+
21
+
22
# Closed set of validation policy names accepted by
# FileBasedStreamConfig.validation_policy; each member's value is the
# human-readable string form of the policy.
# Kept as comments instead of a docstring so the enum's generated schema
# description is left as it was.
class ValidationPolicy(Enum):
    emit_record = "Emit Record"
    skip_record = "Skip Record"
    wait_for_discover = "Wait for Discover"
26
+
27
+
28
# Configuration model for a single file-based stream: which files to select
# (globs), how to parse them (format), and how records are validated against
# the schema (validation_policy, input_schema, schemaless).
# Documented with comments rather than a class docstring so the generated
# schema is not given an extra description.
class FileBasedStreamConfig(BaseModel):
    name: str = Field(title="Name", description="The name of the stream.")
    globs: Optional[List[str]] = Field(
        default=["**"],
        title="Globs",
        description='The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look <a href="https://en.wikipedia.org/wiki/Glob_(programming)">here</a>.',
        order=1,
    )
    legacy_prefix: Optional[str] = Field(
        title="Legacy Prefix",
        description="The path prefix configured in v3 versions of the S3 connector. This option is deprecated in favor of a single glob.",
        airbyte_hidden=True,
    )
    validation_policy: ValidationPolicy = Field(
        title="Validation Policy",
        description="The name of the validation policy that dictates sync behavior when a record does not adhere to the stream schema.",
        default=ValidationPolicy.emit_record,
    )
    input_schema: Optional[str] = Field(
        title="Input Schema",
        description="The schema that will be used to validate records extracted from the file. This will override the stream schema that is auto-detected from incoming files.",
    )
    primary_key: Optional[str] = Field(
        title="Primary Key",
        description="The column or columns (for a composite key) that serves as the unique identifier of a record. If empty, the primary key will default to the parser's default primary key.",
        airbyte_hidden=True,  # Users can create/modify primary keys in the connection configuration so we shouldn't duplicate it here.
    )
    days_to_sync_if_history_is_full: int = Field(
        title="Days To Sync If History Is Full",
        description="When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.",
        default=3,
    )
    format: Union[
        AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat
    ] = Field(
        title="Format",
        description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
    )
    schemaless: bool = Field(
        title="Schemaless",
        description="When enabled, syncs will not validate or structure records against the stream's schema.",
        default=False,
    )
    recent_n_files_to_read_for_schema_discovery: Optional[int] = Field(
        title="Files To Read For Schema Discover",
        description="The number of recent files which will be used to discover the schema for this stream.",
        default=None,
        gt=0,
    )

    @validator("input_schema", pre=True)
    def validate_input_schema(cls, v: Optional[str]) -> Optional[str]:
        """Reject an input schema that cannot be converted to a JSON schema.

        Runs before field parsing (``pre=True``). A falsy value is normalized
        to ``None``.

        Raises:
            ConfigValidationError: if ``type_mapping_to_jsonschema`` returns a
                falsy result for a non-empty value.
        """
        if not v:
            return None
        if not type_mapping_to_jsonschema(v):
            raise ConfigValidationError(FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA)
        return v

    def get_input_schema(self) -> Optional[Mapping[str, Any]]:
        """
        User defined input_schema is defined as a string in the config. This method takes the string representation
        and converts it into a Mapping[str, Any] which is used by file-based CDK components.

        Returns None when no input schema is configured.

        Raises:
            ValueError: if the configured string does not produce a schema.
        """
        if not self.input_schema:
            return None
        schema = type_mapping_to_jsonschema(self.input_schema)
        if not schema:
            raise ValueError(
                f"Unable to create JSON schema from input schema {self.input_schema}"
            )
        return schema
@@ -0,0 +1,18 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from pydantic.v1 import BaseModel, Field
6
+
7
+ from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
8
+
9
+
10
# Format option whose only field is the constant `filetype` tag "jsonl".
# Documented with comments rather than a class docstring so the generated
# schema is not given an extra description.
class JsonlFormat(BaseModel):
    class Config(OneOfOptionConfig):
        title = "Jsonl Format"
        discriminator = "filetype"

    # `const=True` pins the field to its default value.
    filetype: str = Field("jsonl", const=True)
@@ -0,0 +1,25 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+
6
+ from pydantic.v1 import BaseModel, Field
7
+
8
+ from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
9
+
10
+
11
# Format option for Parquet files: the constant `filetype` tag "parquet"
# plus a single opt-in flag for decimal handling.
# Documented with comments rather than a class docstring so the generated
# schema is not given an extra description.
class ParquetFormat(BaseModel):
    class Config(OneOfOptionConfig):
        title = "Parquet Format"
        discriminator = "filetype"

    # `const=True` pins the field to its default value.
    filetype: str = Field("parquet", const=True)

    # This option is not recommended, but necessary for backwards compatibility
    decimal_as_float: bool = Field(
        title="Convert Decimal Fields to Floats",
        description="Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.",
        default=False,
    )
@@ -0,0 +1,102 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from typing import List, Literal, Optional, Union
6
+
7
+ from pydantic.v1 import BaseModel, Field
8
+
9
+ from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
10
+
11
+
12
# Processing option tagged with the constant mode "local"; it carries no
# other settings.
# Documented with comments rather than a class docstring; the schema
# description is supplied through `Config.description`.
class LocalProcessingConfigModel(BaseModel):
    mode: Literal["local"] = Field("local", const=True)

    class Config(OneOfOptionConfig):
        title = "Local"
        description = "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
        discriminator = "mode"
21
+
22
+
23
# One name/value pair; both parts are plain strings.
# Documented with comments rather than a class docstring so the generated
# schema is not given an extra description.
class APIParameterConfigModel(BaseModel):
    name: str = Field(
        title="Parameter name",
        description="The name of the unstructured API parameter to use",
        examples=["combine_under_n_chars", "languages"],
    )
    value: str = Field(
        title="Value",
        description="The value of the parameter",
        examples=["true", "hi_res"],
    )
32
+
33
+
34
# Processing option tagged with the constant mode "api": holds the API key,
# the API URL, and an optional list of extra name/value parameters.
# Documented with comments rather than a class docstring; the schema
# description is supplied through `Config.description`.
class APIProcessingConfigModel(BaseModel):
    mode: Literal["api"] = Field("api", const=True)

    api_key: str = Field(
        default="",
        always_show=True,
        title="API Key",
        airbyte_secret=True,
        description="The API key to use matching the environment",
    )

    api_url: str = Field(
        default="https://api.unstructured.io",
        title="API URL",
        always_show=True,
        description="The URL of the unstructured API to use",
        examples=["https://api.unstructured.com"],
    )

    parameters: Optional[List[APIParameterConfigModel]] = Field(
        default=[],
        always_show=True,
        title="Additional URL Parameters",
        description="List of parameters sent to the API",
    )

    class Config(OneOfOptionConfig):
        title = "via API"
        description = "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured."
        discriminator = "mode"
64
+
65
+
66
# Format option tagged with the constant `filetype` "unstructured". Besides
# the tag it carries the skip-on-error flag, the parsing strategy, and the
# processing configuration (local by default, or API).
# Documented with comments rather than a class docstring; the schema
# description is supplied through `Config.description`.
class UnstructuredFormat(BaseModel):
    class Config(OneOfOptionConfig):
        title = "Unstructured Document Format"
        description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
        discriminator = "filetype"

    # `const=True` pins the field to its default value.
    filetype: str = Field("unstructured", const=True)

    skip_unprocessable_files: bool = Field(
        default=True,
        title="Skip Unprocessable Files",
        description="If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.",
        always_show=True,
    )

    strategy: str = Field(
        always_show=True,
        order=0,
        default="auto",
        title="Parsing Strategy",
        enum=["auto", "fast", "ocr_only", "hi_res"],
        description="The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
    )

    # The default is a local-mode instance; the `mode` field tells the two
    # alternatives apart.
    processing: Union[LocalProcessingConfigModel, APIProcessingConfigModel] = Field(
        default=LocalProcessingConfigModel(mode="local"),
        title="Processing",
        description="Processing configuration",
        discriminator="mode",
        type="object",
    )
@@ -0,0 +1,81 @@
1
+ #
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
6
+ AbstractFileBasedSpec,
7
+ DeliverRawFiles,
8
+ )
9
+ from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
10
+
11
# Attribute name probed (via hasattr) on `delivery_method` before its
# delivery type is read.
DELIVERY_TYPE_KEY = "delivery_type"
# `delivery_type` values that use_permissions_transfer() and
# use_file_transfer() compare against, respectively.
DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE = "use_permissions_transfer"
DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE = "use_file_transfer"
# Attribute names probed (via hasattr) on `delivery_method` by
# preserve_directory_structure() and include_identities_stream().
PRESERVE_DIRECTORY_STRUCTURE_KEY = "preserve_directory_structure"
INCLUDE_IDENTITIES_STREAM_KEY = "include_identities_stream"
16
+
17
+
18
def use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
    """Returns `True` if the configuration uses file transfer mode.

    A delivery method without a `delivery_type` attribute yields `False`.
    """
    delivery_method = parsed_config.delivery_method
    if not hasattr(delivery_method, DELIVERY_TYPE_KEY):
        return False
    return delivery_method.delivery_type == DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE
24
+
25
+
26
def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
    """
    Determines whether to preserve directory structure during file transfer.

    When enabled, files maintain their subdirectory paths in the destination.
    When disabled, files are flattened to the root of the destination.

    The configured flag is only consulted when file transfer is in use and the
    delivery method is a `DeliverRawFiles` exposing the flag; every other case
    falls back to the default.

    Args:
        parsed_config: The parsed configuration containing delivery method settings

    Returns:
        True if directory structure should be preserved (default), False otherwise
    """
    if not use_file_transfer(parsed_config):
        return True
    delivery_method = parsed_config.delivery_method
    exposes_flag = hasattr(delivery_method, PRESERVE_DIRECTORY_STRUCTURE_KEY)
    if exposes_flag and isinstance(delivery_method, DeliverRawFiles):
        return delivery_method.preserve_directory_structure
    return True
46
+
47
+
48
def use_permissions_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
    """
    Determines whether to use permissions transfer to sync ACLs and Identities

    A delivery method without a `delivery_type` attribute yields False.

    Args:
        parsed_config: The parsed configuration containing delivery method settings

    Returns:
        True if permissions transfer should be enabled, False otherwise
    """
    delivery_method = parsed_config.delivery_method
    if not hasattr(delivery_method, DELIVERY_TYPE_KEY):
        return False
    return delivery_method.delivery_type == DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE
63
+
64
+
65
def include_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool:
    """
    There are scenarios where user may not have access to identities but still is valuable to get ACLs

    The configured flag is only consulted when permissions transfer is in use
    and the delivery method is a `DeliverPermissions` exposing the flag; every
    other case yields False.

    Args:
        parsed_config: The parsed configuration containing delivery method settings

    Returns:
        True if we should include Identities stream.
    """
    if not use_permissions_transfer(parsed_config):
        return False
    delivery_method = parsed_config.delivery_method
    exposes_flag = hasattr(delivery_method, INCLUDE_IDENTITIES_STREAM_KEY)
    if exposes_flag and isinstance(delivery_method, DeliverPermissions):
        return delivery_method.include_identities_stream
    return False
@@ -0,0 +1,8 @@
1
+ from airbyte_cdk.sources.file_based.discovery_policy.abstract_discovery_policy import (
2
+ AbstractDiscoveryPolicy,
3
+ )
4
+ from airbyte_cdk.sources.file_based.discovery_policy.default_discovery_policy import (
5
+ DefaultDiscoveryPolicy,
6
+ )
7
+
8
+ __all__ = ["AbstractDiscoveryPolicy", "DefaultDiscoveryPolicy"]
@@ -0,0 +1,21 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
8
+
9
+
10
class AbstractDiscoveryPolicy(ABC):
    """
    Used during discovery; allows the developer to configure the number of concurrent
    requests to send to the source, and the number of files to use for schema discovery.

    Both members are abstract and must be supplied by a concrete policy.
    """

    @property
    @abstractmethod
    def n_concurrent_requests(self) -> int:
        """Number of concurrent requests to send to the source."""

    @abstractmethod
    def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int:
        """Number of files to use for schema inference with the given parser."""
@@ -0,0 +1,33 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from airbyte_cdk.sources.file_based.discovery_policy.abstract_discovery_policy import (
6
+ AbstractDiscoveryPolicy,
7
+ )
8
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
9
+
10
# Values used by DefaultDiscoveryPolicy below.
DEFAULT_N_CONCURRENT_REQUESTS = 10
DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE = 10


class DefaultDiscoveryPolicy(AbstractDiscoveryPolicy):
    """
    Default number of concurrent requests to send to the source on discover, and number
    of files to use for schema inference.
    """

    @property
    def n_concurrent_requests(self) -> int:
        """Return the module-level default request concurrency."""
        return DEFAULT_N_CONCURRENT_REQUESTS

    def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int:
        """Return the smaller of the default cap and the parser's own cap.

        A falsy parser cap (for example ``None``) is ignored, in which case
        the default cap is returned.
        """
        parser_cap = parser.parser_max_n_files_for_schema_inference
        if not parser_cap:
            return DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE
        return min(DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE, parser_cap)