airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,480 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import logging
5
+ import os
6
+ import traceback
7
+ from datetime import datetime
8
+ from io import BytesIO, IOBase
9
+ from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
10
+
11
+ import backoff
12
+ import dpath
13
+ import nltk
14
+ import requests
15
+ from unstructured.file_utils.filetype import (
16
+ EXT_TO_FILETYPE,
17
+ FILETYPE_TO_MIMETYPE,
18
+ STR_TO_FILETYPE,
19
+ FileType,
20
+ detect_filetype,
21
+ )
22
+
23
+ from airbyte_cdk.models import FailureType
24
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
25
+ from airbyte_cdk.sources.file_based.config.unstructured_format import (
26
+ APIParameterConfigModel,
27
+ APIProcessingConfigModel,
28
+ LocalProcessingConfigModel,
29
+ UnstructuredFormat,
30
+ )
31
+ from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
32
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
33
+ AbstractFileBasedStreamReader,
34
+ FileReadMode,
35
+ )
36
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
37
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
38
+ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
39
+ from airbyte_cdk.utils import is_cloud_environment
40
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
41
+
42
# Placeholders for the `unstructured` partition functions. They stay None until
# _import_unstructured() rebinds them to the real callables.
unstructured_partition_pdf = None
unstructured_partition_docx = None
unstructured_partition_pptx = None

# Directory tried first by get_nltk_temp_folder() for NLTK data.
AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
# Fallback directory used by get_nltk_temp_folder() when the first one cannot be created.
TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
48
+
49
+
50
def get_nltk_temp_folder() -> str:
    """
    Return a directory that can hold NLTK data, creating it if necessary.

    /airbyte is tried first because /tmp is not currently writable for non-root
    connectors (this may be allowed in the future). When /airbyte cannot be created,
    e.g. during local development, /tmp is used instead.
    """
    try:
        os.makedirs(AIRBYTE_NLTK_DATA_DIR, exist_ok=True)
    except OSError:
        # Preferred location unavailable: fall back. A failure here propagates to the caller.
        os.makedirs(TMP_NLTK_DATA_DIR, exist_ok=True)
        return TMP_NLTK_DATA_DIR
    return AIRBYTE_NLTK_DATA_DIR
62
+
63
+
64
# Make sure the NLTK resources needed for parsing are available at import time.
# Each resource is looked up locally first; a download is only attempted when
# at least one of them is missing.
try:
    nltk_data_dir = get_nltk_temp_folder()
    nltk.data.path.append(nltk_data_dir)
    nltk.data.find("tokenizers/punkt.zip")
    nltk.data.find("tokenizers/punkt_tab.zip")
    # The perceptron tagger is installed under "taggers/", not "tokenizers/".
    # Looking it up in the wrong sub-directory always raised LookupError and
    # forced a re-download of every package on each import.
    nltk.data.find("taggers/averaged_perceptron_tagger_eng.zip")
except LookupError:
    nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
    nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
    nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
74
+
75
+
76
def optional_decode(contents: Union[str, bytes]) -> str:
    """Return `contents` as text: bytes are decoded as UTF-8, strings pass through unchanged."""
    return contents.decode("utf-8") if isinstance(contents, bytes) else contents
80
+
81
+
82
def _import_unstructured() -> None:
    """
    Import the `unstructured` partition functions and publish them as module globals.

    The import is deferred to call time because importing `unstructured` is slow.
    """
    global unstructured_partition_pdf, unstructured_partition_docx, unstructured_partition_pptx

    from unstructured.partition.docx import partition_docx
    from unstructured.partition.pdf import partition_pdf
    from unstructured.partition.pptx import partition_pptx

    # Each function gets its own global (rather than e.g. a shared mapping)
    # so that typing is propagated properly.
    unstructured_partition_pdf = partition_pdf
    unstructured_partition_docx = partition_docx
    unstructured_partition_pptx = partition_pptx
95
+
96
+
97
def user_error(e: Exception) -> bool:
    """
    Return True if this exception is caused by user error, False otherwise.

    An exception counts as a user error when it is either:
      * a RecordParseError (the problem lies in the file content), or
      * a requests RequestException whose response carries a 4xx status code.

    Anything else (including request errors without a response, and 5xx
    responses) is not a user error.
    """
    if isinstance(e, RecordParseError):
        return True
    if not isinstance(e, requests.exceptions.RequestException):
        return False
    response = e.response
    # Compare against None explicitly: a requests.Response is falsy for any
    # 4xx/5xx status, so a plain truthiness test would reject exactly the
    # responses this check is meant to detect.
    return response is not None and 400 <= response.status_code < 500
106
+
107
+
108
+ CLOUD_DEPLOYMENT_MODE = "cloud"
109
+
110
+
111
+ class UnstructuredParser(FileTypeParser):
112
@property
def parser_max_n_files_for_schema_inference(self) -> Optional[int]:
    """
    Maximum number of files to inspect for schema inference.

    The schema is static, so a single file is enough.
    """
    return 1
118
+
119
@property
def parser_max_n_files_for_parsability(self) -> Optional[int]:
    """
    Maximum number of files to inspect for parsability.

    No file is checked: the check might be an expensive operation and does not
    give much confidence whether the sync will succeed.
    """
    return 0
125
+
126
def get_parser_defined_primary_key(self, config: FileBasedStreamConfig) -> Optional[str]:
    """
    Return the name of the field used as primary key: `document_key`.

    This pre-selects the document key column as the primary key when setting up a
    connection, making it easier for the user to configure normalization in the
    destination. The `config` argument is not consulted.
    """
    return "document_key"
133
+
134
async def infer_schema(
    self,
    config: FileBasedStreamConfig,
    file: RemoteFile,
    stream_reader: AbstractFileBasedStreamReader,
    logger: logging.Logger,
) -> SchemaType:
    """
    Return the fixed schema emitted by this parser after checking the file type.

    The file is opened and its type detected. If the type is not one of the
    supported file types and `skip_unprocessable_files` is not set on the
    format, a parse error is raised. Otherwise the same three string fields
    are returned regardless of the file's content.
    """
    parser_format = _extract_format(config)
    with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
        detected_type = self._get_filetype(file_handle, file)
        is_supported = detected_type in self._supported_file_types()
        if not is_supported and not parser_format.skip_unprocessable_files:
            error_message = self._get_file_type_error_message(detected_type)
            raise self._create_parse_error(file, error_message)

        content_field = {
            "type": "string",
            "description": "Content of the file as markdown. Might be null if the file could not be parsed",
        }
        document_key_field = {
            "type": "string",
            "description": "Unique identifier of the document, e.g. the file path",
        }
        parse_error_field = {
            "type": "string",
            "description": "Error message if the file could not be parsed even though the file is supported",
        }
        return {
            "content": content_field,
            "document_key": document_key_field,
            "_ab_source_file_parse_error": parse_error_field,
        }
164
+
165
+ def parse_records(
166
+ self,
167
+ config: FileBasedStreamConfig,
168
+ file: RemoteFile,
169
+ stream_reader: AbstractFileBasedStreamReader,
170
+ logger: logging.Logger,
171
+ discovered_schema: Optional[Mapping[str, SchemaType]],
172
+ ) -> Iterable[Dict[str, Any]]:
173
+ format = _extract_format(config)
174
+ with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
175
+ try:
176
+ markdown = self._read_file(file_handle, file, format, logger)
177
+ yield {
178
+ "content": markdown,
179
+ "document_key": file.uri,
180
+ "_ab_source_file_parse_error": None,
181
+ }
182
+ except RecordParseError as e:
183
+ # RecordParseError is raised when the file can't be parsed because of a problem with the file content (either the file is not supported or the file is corrupted)
184
+ # if the skip_unprocessable_files flag is set, we log a warning and pass the error as part of the document
185
+ # otherwise, we raise the error to fail the sync
186
+ if format.skip_unprocessable_files:
187
+ exception_str = str(e)
188
+ logger.warn(f"File {file.uri} caused an error during parsing: {exception_str}.")
189
+ yield {
190
+ "content": None,
191
+ "document_key": file.uri,
192
+ "_ab_source_file_parse_error": exception_str,
193
+ }
194
+ logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
195
+ else:
196
+ raise e
197
+ except Exception as e:
198
+ exception_str = str(e)
199
+ logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
200
+ raise e
201
+
202
+ def _read_file(
203
+ self,
204
+ file_handle: IOBase,
205
+ remote_file: RemoteFile,
206
+ format: UnstructuredFormat,
207
+ logger: logging.Logger,
208
+ ) -> str:
209
+ _import_unstructured()
210
+ if (
211
+ (not unstructured_partition_pdf)
212
+ or (not unstructured_partition_docx)
213
+ or (not unstructured_partition_pptx)
214
+ ):
215
+ # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
216
+ raise Exception("unstructured library is not available")
217
+
218
+ filetype: FileType | None = self._get_filetype(file_handle, remote_file)
219
+
220
+ if filetype is None or filetype not in self._supported_file_types():
221
+ raise self._create_parse_error(
222
+ remote_file,
223
+ self._get_file_type_error_message(filetype),
224
+ )
225
+ if filetype in {FileType.MD, FileType.TXT}:
226
+ file_content: bytes = file_handle.read()
227
+ decoded_content: str = optional_decode(file_content)
228
+ return decoded_content
229
+ if format.processing.mode == "local":
230
+ return self._read_file_locally(
231
+ file_handle,
232
+ filetype,
233
+ format.strategy,
234
+ remote_file,
235
+ )
236
+ elif format.processing.mode == "api":
237
+ try:
238
+ result: str = self._read_file_remotely_with_retries(
239
+ file_handle,
240
+ format.processing,
241
+ filetype,
242
+ format.strategy,
243
+ remote_file,
244
+ )
245
+ except Exception as e:
246
+ # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
247
+ #
248
+ # For other exceptions, re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA.
249
+ # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
250
+ if isinstance(e, RecordParseError):
251
+ raise e
252
+ raise AirbyteTracedException.from_exception(
253
+ e, failure_type=FailureType.config_error
254
+ )
255
+
256
+ return result
257
+
258
+ def _params_to_dict(
259
+ self, params: Optional[List[APIParameterConfigModel]], strategy: str
260
+ ) -> Dict[str, Union[str, List[str]]]:
261
+ result_dict: Dict[str, Union[str, List[str]]] = {"strategy": strategy}
262
+ if params is None:
263
+ return result_dict
264
+ for item in params:
265
+ key = item.name
266
+ value = item.value
267
+ if key in result_dict:
268
+ existing_value = result_dict[key]
269
+ # If the key already exists, append the new value to its list
270
+ if isinstance(existing_value, list):
271
+ existing_value.append(value)
272
+ else:
273
+ result_dict[key] = [existing_value, value]
274
+ else:
275
+ # If the key doesn't exist, add it to the dictionary
276
+ result_dict[key] = value
277
+
278
+ return result_dict
279
+
280
+ def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
281
+ """
282
+ Perform a connection check for the parser config:
283
+ - Verify that encryption is enabled if the API is hosted on a cloud instance.
284
+ - Verify that the API can extract text from a file.
285
+
286
+ For local processing, we don't need to perform any additional checks, implicit pydantic validation is enough.
287
+ """
288
+ format_config = _extract_format(config)
289
+ if isinstance(format_config.processing, LocalProcessingConfigModel):
290
+ if format_config.strategy == "hi_res":
291
+ return False, "Hi-res strategy is not supported for local processing"
292
+ return True, None
293
+
294
+ if is_cloud_environment() and not format_config.processing.api_url.startswith("https://"):
295
+ return False, "Base URL must start with https://"
296
+
297
+ try:
298
+ self._read_file_remotely(
299
+ BytesIO(b"# Airbyte source connection test"),
300
+ format_config.processing,
301
+ FileType.MD,
302
+ "auto",
303
+ RemoteFile(uri="test", last_modified=datetime.now()),
304
+ )
305
+ except Exception:
306
+ return False, "".join(traceback.format_exc())
307
+
308
+ return True, None
309
+
310
+ @backoff.on_exception(
311
+ backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error
312
+ )
313
+ def _read_file_remotely_with_retries(
314
+ self,
315
+ file_handle: IOBase,
316
+ format: APIProcessingConfigModel,
317
+ filetype: FileType,
318
+ strategy: str,
319
+ remote_file: RemoteFile,
320
+ ) -> str:
321
+ """
322
+ Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
323
+ """
324
+ return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
325
+
326
+ def _read_file_remotely(
327
+ self,
328
+ file_handle: IOBase,
329
+ format: APIProcessingConfigModel,
330
+ filetype: FileType,
331
+ strategy: str,
332
+ remote_file: RemoteFile,
333
+ ) -> str:
334
+ headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
335
+
336
+ data = self._params_to_dict(format.parameters, strategy)
337
+
338
+ file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
339
+
340
+ response = requests.post(
341
+ f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
342
+ )
343
+
344
+ if response.status_code == 422:
345
+ # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
346
+ raise self._create_parse_error(remote_file, response.json())
347
+ else:
348
+ # Other error statuses are raised as requests exceptions (retry everything except user errors)
349
+ response.raise_for_status()
350
+
351
+ json_response = response.json()
352
+
353
+ return self._render_markdown(json_response)
354
+
355
+ def _read_file_locally(
356
+ self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
357
+ ) -> str:
358
+ _import_unstructured()
359
+ if (
360
+ (not unstructured_partition_pdf)
361
+ or (not unstructured_partition_docx)
362
+ or (not unstructured_partition_pptx)
363
+ ):
364
+ # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
365
+ raise Exception("unstructured library is not available")
366
+
367
+ file: Any = file_handle
368
+
369
+ # before the parsing logic is entered, the file is read completely to make sure it is in local memory
370
+ file_handle.seek(0)
371
+ file_handle.read()
372
+ file_handle.seek(0)
373
+
374
+ try:
375
+ if filetype == FileType.PDF:
376
+ # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects
377
+ file_handle.seek(0)
378
+ with BytesIO(file_handle.read()) as file:
379
+ file_handle.seek(0)
380
+ elements = unstructured_partition_pdf(file=file, strategy=strategy)
381
+ elif filetype == FileType.DOCX:
382
+ elements = unstructured_partition_docx(file=file)
383
+ elif filetype == FileType.PPTX:
384
+ elements = unstructured_partition_pptx(file=file)
385
+ except Exception as e:
386
+ raise self._create_parse_error(remote_file, str(e))
387
+
388
+ return self._render_markdown([element.to_dict() for element in elements])
389
+
390
+ def _create_parse_error(
391
+ self,
392
+ remote_file: RemoteFile,
393
+ message: str,
394
+ ) -> RecordParseError:
395
+ return RecordParseError(
396
+ FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
397
+ )
398
+
399
+ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
400
+ """
401
+ Detect the file type based on the file name and the file content.
402
+
403
+ There are three strategies to determine the file type:
404
+ 1. Use the mime type if available (only some sources support it)
405
+ 2. Use the file name if available
406
+ 3. Use the file content
407
+ """
408
+ if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE:
409
+ return STR_TO_FILETYPE[remote_file.mime_type]
410
+
411
+ # set name to none, otherwise unstructured will try to get the modified date from the local file system
412
+ if hasattr(file, "name"):
413
+ file.name = None
414
+
415
+ # detect_filetype is either using the file name or file content
416
+ # if possible, try to leverage the file name to detect the file type
417
+ # if the file name is not available, use the file content
418
+ file_type: FileType | None = None
419
+ try:
420
+ file_type = detect_filetype(
421
+ filename=remote_file.uri,
422
+ )
423
+ except Exception:
424
+ # Path doesn't exist locally. Try something else...
425
+ pass
426
+
427
+ if file_type and file_type != FileType.UNK:
428
+ return file_type
429
+
430
+ type_based_on_content = detect_filetype(file=file)
431
+ file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
432
+
433
+ if type_based_on_content and type_based_on_content != FileType.UNK:
434
+ return type_based_on_content
435
+
436
+ extension = "." + remote_file.uri.split(".")[-1].lower()
437
+ if extension in EXT_TO_FILETYPE:
438
+ return EXT_TO_FILETYPE[extension]
439
+
440
+ return None
441
+
442
+ def _supported_file_types(self) -> List[Any]:
443
+ return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
444
+
445
+ def _get_file_type_error_message(
446
+ self,
447
+ file_type: FileType | None,
448
+ ) -> str:
449
+ supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
450
+ return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
451
+
452
+ def _render_markdown(self, elements: List[Any]) -> str:
453
+ return "\n\n".join((self._convert_to_markdown(el) for el in elements))
454
+
455
+ def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
456
+ if dpath.get(el, "type") == "Title":
457
+ category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
458
+ if not isinstance(category_depth, int):
459
+ category_depth = (
460
+ int(category_depth) if isinstance(category_depth, (str, float)) else 1
461
+ )
462
+ heading_str = "#" * category_depth
463
+ return f"{heading_str} {dpath.get(el, 'text')}"
464
+ elif dpath.get(el, "type") == "ListItem":
465
+ return f"- {dpath.get(el, 'text')}"
466
+ elif dpath.get(el, "type") == "Formula":
467
+ return f"```\n{dpath.get(el, 'text')}\n```"
468
+ else:
469
+ return str(dpath.get(el, "text", default=""))
470
+
471
+ @property
472
+ def file_read_mode(self) -> FileReadMode:
473
+ return FileReadMode.READ_BINARY
474
+
475
+
476
def _extract_format(config: FileBasedStreamConfig) -> UnstructuredFormat:
    """
    Return the format section of the stream config, checked to be an UnstructuredFormat.

    :param config: the stream configuration whose `format` attribute is inspected
    :return: `config.format`, unchanged
    :raises ValueError: if `config.format` is not an UnstructuredFormat instance
    """
    candidate = config.format
    if isinstance(candidate, UnstructuredFormat):
        return candidate
    raise ValueError(f"Invalid format config: {candidate}")
@@ -0,0 +1,18 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from datetime import datetime
6
+ from typing import Optional
7
+
8
+ from pydantic.v1 import BaseModel
9
+
10
+
11
class RemoteFile(BaseModel):
    """
    A file in a file-based stream.

    Pydantic model with two required fields (`uri`, `last_modified`) and one optional
    field (`mime_type`).
    """

    # Identifier of the file, given as a string.
    uri: str
    # NOTE(review): presumably the modification time reported by the source - confirm.
    last_modified: datetime
    # Mime type of the file when known; defaults to None.
    mime_type: Optional[str] = None