airbyte-cdk 0.72.1__py3-none-any.whl → 6.13.1.dev4107__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (517) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +421 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1185 -85
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +65 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +25 -3
  79. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  80. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  81. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  82. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +159 -74
  83. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  84. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  85. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  86. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  91. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  92. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  93. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  94. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  95. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  96. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  97. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1319 -603
  98. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  99. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  100. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  101. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1695 -225
  102. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  103. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  104. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  105. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  106. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  107. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  108. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  109. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  122. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  123. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  124. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  125. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  126. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  127. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  134. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  135. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  136. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  140. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  141. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  142. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  143. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  144. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  145. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  146. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  147. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  148. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  149. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +228 -72
  150. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  151. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  152. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  153. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  154. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  155. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  156. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  157. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  158. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  159. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  160. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  163. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  164. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  165. airbyte_cdk/sources/declarative/types.py +19 -110
  166. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  167. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  168. airbyte_cdk/sources/embedded/catalog.py +16 -4
  169. airbyte_cdk/sources/embedded/runner.py +19 -3
  170. airbyte_cdk/sources/embedded/tools.py +5 -2
  171. airbyte_cdk/sources/file_based/README.md +152 -0
  172. airbyte_cdk/sources/file_based/__init__.py +24 -0
  173. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  174. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  175. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  176. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +58 -10
  177. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  178. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  179. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  180. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  181. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  182. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  184. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  185. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  186. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  187. airbyte_cdk/sources/file_based/exceptions.py +52 -15
  188. airbyte_cdk/sources/file_based/file_based_source.py +163 -33
  189. airbyte_cdk/sources/file_based/file_based_stream_reader.py +83 -5
  190. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  191. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  192. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  193. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  194. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  195. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  196. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  197. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  198. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +147 -41
  199. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  200. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  201. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  202. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  204. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  205. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  206. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  210. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  211. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  212. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +175 -45
  213. airbyte_cdk/sources/http_logger.py +8 -3
  214. airbyte_cdk/sources/message/__init__.py +7 -1
  215. airbyte_cdk/sources/message/repository.py +18 -4
  216. airbyte_cdk/sources/source.py +42 -38
  217. airbyte_cdk/sources/streams/__init__.py +2 -2
  218. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  219. airbyte_cdk/sources/streams/call_rate.py +64 -21
  220. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  221. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  222. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  223. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  224. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  225. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  226. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  227. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  228. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  229. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  230. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  231. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  232. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  233. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  234. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  236. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  237. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  238. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  239. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  240. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  241. airbyte_cdk/sources/streams/core.py +412 -87
  242. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  243. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  244. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  245. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  253. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  254. airbyte_cdk/sources/streams/http/http.py +369 -246
  255. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  256. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  257. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  259. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  260. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  261. airbyte_cdk/sources/types.py +154 -0
  262. airbyte_cdk/sources/utils/record_helper.py +36 -21
  263. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  264. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  265. airbyte_cdk/sources/utils/transform.py +54 -20
  266. airbyte_cdk/sql/_util/hashing.py +34 -0
  267. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  268. airbyte_cdk/sql/constants.py +32 -0
  269. airbyte_cdk/sql/exceptions.py +235 -0
  270. airbyte_cdk/sql/secrets.py +123 -0
  271. airbyte_cdk/sql/shared/__init__.py +15 -0
  272. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  273. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  274. airbyte_cdk/sql/types.py +160 -0
  275. airbyte_cdk/test/catalog_builder.py +70 -18
  276. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  277. airbyte_cdk/test/mock_http/__init__.py +1 -1
  278. airbyte_cdk/test/mock_http/matcher.py +6 -0
  279. airbyte_cdk/test/mock_http/mocker.py +57 -10
  280. airbyte_cdk/test/mock_http/request.py +19 -3
  281. airbyte_cdk/test/mock_http/response.py +3 -1
  282. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  283. airbyte_cdk/test/state_builder.py +18 -10
  284. airbyte_cdk/test/utils/__init__.py +1 -0
  285. airbyte_cdk/test/utils/data.py +24 -0
  286. airbyte_cdk/test/utils/http_mocking.py +16 -0
  287. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  288. airbyte_cdk/test/utils/reading.py +26 -0
  289. airbyte_cdk/utils/__init__.py +2 -1
  290. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  291. airbyte_cdk/utils/analytics_message.py +10 -2
  292. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  293. airbyte_cdk/utils/event_timing.py +10 -10
  294. airbyte_cdk/utils/mapping_helpers.py +3 -1
  295. airbyte_cdk/utils/message_utils.py +20 -11
  296. airbyte_cdk/utils/print_buffer.py +75 -0
  297. airbyte_cdk/utils/schema_inferrer.py +198 -28
  298. airbyte_cdk/utils/slice_hasher.py +30 -0
  299. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  300. airbyte_cdk/utils/stream_status_utils.py +8 -1
  301. airbyte_cdk/utils/traced_exception.py +61 -21
  302. airbyte_cdk-6.13.1.dev4107.dist-info/METADATA +109 -0
  303. airbyte_cdk-6.13.1.dev4107.dist-info/RECORD +349 -0
  304. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.13.1.dev4107.dist-info}/WHEEL +1 -2
  305. airbyte_cdk-6.13.1.dev4107.dist-info/entry_points.txt +3 -0
  306. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  307. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  308. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  309. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  311. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  312. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  313. airbyte_cdk/sources/deprecated/client.py +0 -99
  314. airbyte_cdk/sources/singer/__init__.py +0 -8
  315. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  316. airbyte_cdk/sources/singer/source.py +0 -186
  317. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  318. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  319. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  320. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  321. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  322. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  323. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  324. airbyte_cdk/sources/utils/schema_models.py +0 -84
  325. airbyte_cdk-0.72.1.dist-info/METADATA +0 -243
  326. airbyte_cdk-0.72.1.dist-info/RECORD +0 -466
  327. airbyte_cdk-0.72.1.dist-info/top_level.txt +0 -3
  328. source_declarative_manifest/main.py +0 -29
  329. unit_tests/connector_builder/__init__.py +0 -3
  330. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  331. unit_tests/connector_builder/test_message_grouper.py +0 -713
  332. unit_tests/connector_builder/utils.py +0 -27
  333. unit_tests/destinations/test_destination.py +0 -243
  334. unit_tests/singer/test_singer_helpers.py +0 -56
  335. unit_tests/singer/test_singer_source.py +0 -112
  336. unit_tests/sources/__init__.py +0 -0
  337. unit_tests/sources/concurrent_source/__init__.py +0 -3
  338. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  339. unit_tests/sources/declarative/__init__.py +0 -3
  340. unit_tests/sources/declarative/auth/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  342. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  343. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  344. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  345. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  346. unit_tests/sources/declarative/checks/__init__.py +0 -3
  347. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  348. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  349. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  350. unit_tests/sources/declarative/external_component.py +0 -13
  351. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  352. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  353. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  355. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  356. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  357. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  359. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  360. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  361. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  362. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  363. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  364. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  365. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  366. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  367. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  368. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  369. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  370. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1847
  371. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  372. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  373. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  374. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  375. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  376. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  377. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  384. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  385. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  386. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  387. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  388. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  389. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  390. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  391. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  392. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  393. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  394. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  395. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  396. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  397. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  398. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  399. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  400. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  401. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  402. unit_tests/sources/declarative/schema/__init__.py +0 -6
  403. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  404. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  405. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  406. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  407. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  408. unit_tests/sources/declarative/states/__init__.py +0 -3
  409. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  411. unit_tests/sources/declarative/test_create_partial.py +0 -83
  412. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  413. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  414. unit_tests/sources/declarative/test_types.py +0 -39
  415. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  416. unit_tests/sources/file_based/__init__.py +0 -0
  417. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  419. unit_tests/sources/file_based/config/__init__.py +0 -0
  420. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  421. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  422. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  423. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  424. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  425. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  426. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  427. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  428. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  429. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  430. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  431. unit_tests/sources/file_based/helpers.py +0 -70
  432. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  433. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  434. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  435. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  436. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  437. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  438. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  439. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  440. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  441. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  442. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  443. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  444. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  445. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  446. unit_tests/sources/file_based/stream/__init__.py +0 -0
  447. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  449. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  450. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  451. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  452. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  453. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  454. unit_tests/sources/file_based/test_scenarios.py +0 -253
  455. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  456. unit_tests/sources/fixtures/__init__.py +0 -3
  457. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  458. unit_tests/sources/message/__init__.py +0 -0
  459. unit_tests/sources/message/test_repository.py +0 -153
  460. unit_tests/sources/streams/__init__.py +0 -0
  461. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  462. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  464. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  466. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  467. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  469. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  470. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  471. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  472. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  473. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  474. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  475. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  476. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  477. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  478. unit_tests/sources/streams/http/__init__.py +0 -0
  479. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  481. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  482. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  483. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  484. unit_tests/sources/streams/http/test_http.py +0 -635
  485. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  486. unit_tests/sources/streams/test_call_rate.py +0 -300
  487. unit_tests/sources/streams/test_stream_read.py +0 -405
  488. unit_tests/sources/streams/test_streams_core.py +0 -184
  489. unit_tests/sources/test_abstract_source.py +0 -1442
  490. unit_tests/sources/test_concurrent_source.py +0 -112
  491. unit_tests/sources/test_config.py +0 -92
  492. unit_tests/sources/test_connector_state_manager.py +0 -482
  493. unit_tests/sources/test_http_logger.py +0 -252
  494. unit_tests/sources/test_integration_source.py +0 -86
  495. unit_tests/sources/test_source.py +0 -684
  496. unit_tests/sources/test_source_read.py +0 -460
  497. unit_tests/test/__init__.py +0 -0
  498. unit_tests/test/mock_http/__init__.py +0 -0
  499. unit_tests/test/mock_http/test_matcher.py +0 -53
  500. unit_tests/test/mock_http/test_mocker.py +0 -214
  501. unit_tests/test/mock_http/test_request.py +0 -117
  502. unit_tests/test/mock_http/test_response_builder.py +0 -177
  503. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  504. unit_tests/utils/__init__.py +0 -0
  505. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  506. unit_tests/utils/test_mapping_helpers.py +0 -54
  507. unit_tests/utils/test_message_utils.py +0 -91
  508. unit_tests/utils/test_rate_limiting.py +0 -26
  509. unit_tests/utils/test_schema_inferrer.py +0 -202
  510. unit_tests/utils/test_secret_utils.py +0 -135
  511. unit_tests/utils/test_stream_status_utils.py +0 -61
  512. unit_tests/utils/test_traced_exception.py +0 -107
  513. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  514. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  515. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  516. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  517. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.13.1.dev4107.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,37 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import logging
5
+ import os
6
+ from typing import Any, Dict, Iterable
7
+
8
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
9
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
10
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
+
12
+ AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
13
+ DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
14
+
15
+
16
+ class FileTransfer:
17
+ def __init__(self) -> None:
18
+ self._local_directory = (
19
+ AIRBYTE_STAGING_DIRECTORY
20
+ if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
21
+ else DEFAULT_LOCAL_DIRECTORY
22
+ )
23
+
24
+ def get_file(
25
+ self,
26
+ config: FileBasedStreamConfig,
27
+ file: RemoteFile,
28
+ stream_reader: AbstractFileBasedStreamReader,
29
+ logger: logging.Logger,
30
+ ) -> Iterable[Dict[str, Any]]:
31
+ try:
32
+ yield stream_reader.get_file(
33
+ file=file, local_directory=self._local_directory, logger=logger
34
+ )
35
+ except Exception as ex:
36
+ logger.error("An error has occurred while getting file: %s", str(ex))
37
+ raise ex
@@ -7,7 +7,10 @@ from abc import ABC, abstractmethod
7
7
  from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
8
8
 
9
9
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
10
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
11
+ AbstractFileBasedStreamReader,
12
+ FileReadMode,
13
+ )
11
14
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
12
15
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
13
16
 
@@ -6,16 +6,24 @@ import json
6
6
  import logging
7
7
  from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
8
8
 
9
+ import orjson
10
+
9
11
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
12
  from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
11
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
13
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
14
+ AbstractFileBasedStreamReader,
15
+ FileReadMode,
16
+ )
12
17
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
13
18
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
14
- from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, SchemaType, merge_schemas
19
+ from airbyte_cdk.sources.file_based.schema_helpers import (
20
+ PYTHON_TYPE_MAPPING,
21
+ SchemaType,
22
+ merge_schemas,
23
+ )
15
24
 
16
25
 
17
26
  class JsonlParser(FileTypeParser):
18
-
19
27
  MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
20
28
  ENCODING = "utf8"
21
29
 
@@ -100,18 +108,24 @@ class JsonlParser(FileTypeParser):
100
108
  read_bytes += len(line)
101
109
  accumulator += line # type: ignore [operator] # In reality, it's either bytes or string and we add the same type
102
110
  try:
103
- record = json.loads(accumulator)
111
+ record = orjson.loads(accumulator)
104
112
  if had_json_parsing_error and not has_warned_for_multiline_json_object:
105
- logger.warning(f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced")
113
+ logger.warning(
114
+ f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced"
115
+ )
106
116
  has_warned_for_multiline_json_object = True
107
117
 
108
118
  yield record
109
119
  yielded_at_least_once = True
110
120
  accumulator = self._instantiate_accumulator(line)
111
- except json.JSONDecodeError:
121
+ except orjson.JSONDecodeError:
112
122
  had_json_parsing_error = True
113
123
 
114
- if read_limit and yielded_at_least_once and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
124
+ if (
125
+ read_limit
126
+ and yielded_at_least_once
127
+ and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE
128
+ ):
115
129
  logger.warning(
116
130
  f"Exceeded the maximum number of bytes per file for schema inference ({self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE}). "
117
131
  f"Inferring schema from an incomplete set of records."
@@ -119,7 +133,9 @@ class JsonlParser(FileTypeParser):
119
133
  break
120
134
 
121
135
  if had_json_parsing_error and not yielded_at_least_once:
122
- raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
136
+ raise RecordParseError(
137
+ FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line
138
+ )
123
139
 
124
140
  @staticmethod
125
141
  def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
@@ -10,17 +10,27 @@ from urllib.parse import unquote
10
10
 
11
11
  import pyarrow as pa
12
12
  import pyarrow.parquet as pq
13
- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
14
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
15
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
13
+ from pyarrow import DictionaryArray, Scalar
14
+
15
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
16
+ FileBasedStreamConfig,
17
+ ParquetFormat,
18
+ )
19
+ from airbyte_cdk.sources.file_based.exceptions import (
20
+ ConfigValidationError,
21
+ FileBasedSourceError,
22
+ RecordParseError,
23
+ )
24
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
25
+ AbstractFileBasedStreamReader,
26
+ FileReadMode,
27
+ )
16
28
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
17
29
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
18
30
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
19
- from pyarrow import DictionaryArray, Scalar
20
31
 
21
32
 
22
33
  class ParquetParser(FileTypeParser):
23
-
24
34
  ENCODING = None
25
35
 
26
36
  def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
@@ -45,9 +55,15 @@ class ParquetParser(FileTypeParser):
45
55
  parquet_schema = parquet_file.schema_arrow
46
56
 
47
57
  # Inferred non-partition schema
48
- schema = {field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format) for field in parquet_schema}
58
+ schema = {
59
+ field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format)
60
+ for field in parquet_schema
61
+ }
49
62
  # Inferred partition schema
50
- partition_columns = {partition.split("=")[0]: {"type": "string"} for partition in self._extract_partitions(file.uri)}
63
+ partition_columns = {
64
+ partition.split("=")[0]: {"type": "string"}
65
+ for partition in self._extract_partitions(file.uri)
66
+ }
51
67
 
52
68
  schema.update(partition_columns)
53
69
  return schema
@@ -69,21 +85,27 @@ class ParquetParser(FileTypeParser):
69
85
  try:
70
86
  with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
71
87
  reader = pq.ParquetFile(fp)
72
- partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
88
+ partition_columns = {
89
+ x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)
90
+ }
73
91
  for row_group in range(reader.num_row_groups):
74
92
  batch = reader.read_row_group(row_group)
75
93
  for row in range(batch.num_rows):
76
94
  line_no += 1
77
95
  yield {
78
96
  **{
79
- column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
97
+ column: ParquetParser._to_output_value(
98
+ batch.column(column)[row], parquet_format
99
+ )
80
100
  for column in batch.column_names
81
101
  },
82
102
  **partition_columns,
83
103
  }
84
104
  except Exception as exc:
85
105
  raise RecordParseError(
86
- FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
106
+ FileBasedSourceError.ERROR_PARSING_RECORD,
107
+ filename=file.uri,
108
+ lineno=f"{row_group=}, {line_no=}",
87
109
  ) from exc
88
110
 
89
111
  @staticmethod
@@ -95,7 +117,9 @@ class ParquetParser(FileTypeParser):
95
117
  return FileReadMode.READ_BINARY
96
118
 
97
119
  @staticmethod
98
- def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any:
120
+ def _to_output_value(
121
+ parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat
122
+ ) -> Any:
99
123
  """
100
124
  Convert an entry in a pyarrow table to a value that can be output by the source.
101
125
  """
@@ -113,7 +137,11 @@ class ParquetParser(FileTypeParser):
113
137
  return None
114
138
 
115
139
  # Convert date and datetime objects to isoformat strings
116
- if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type):
140
+ if (
141
+ pa.types.is_time(parquet_value.type)
142
+ or pa.types.is_timestamp(parquet_value.type)
143
+ or pa.types.is_date(parquet_value.type)
144
+ ):
117
145
  return parquet_value.as_py().isoformat()
118
146
 
119
147
  # Convert month_day_nano_interval to array
@@ -126,7 +154,7 @@ class ParquetParser(FileTypeParser):
126
154
 
127
155
  if pa.types.is_decimal(parquet_value.type):
128
156
  if parquet_format.decimal_as_float:
129
- return parquet_value.as_py()
157
+ return float(parquet_value.as_py())
130
158
  else:
131
159
  return str(parquet_value.as_py())
132
160
 
@@ -168,7 +196,9 @@ class ParquetParser(FileTypeParser):
168
196
  }
169
197
 
170
198
  @staticmethod
171
- def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]:
199
+ def parquet_type_to_schema_type(
200
+ parquet_type: pa.DataType, parquet_format: ParquetFormat
201
+ ) -> Mapping[str, str]:
172
202
  """
173
203
  Convert a pyarrow data type to an Airbyte schema type.
174
204
  Parquet data types are defined at https://arrow.apache.org/docs/python/api/datatypes.html
@@ -198,7 +228,9 @@ class ParquetParser(FileTypeParser):
198
228
  @staticmethod
199
229
  def _is_binary(parquet_type: pa.DataType) -> bool:
200
230
  return bool(
201
- pa.types.is_binary(parquet_type) or pa.types.is_large_binary(parquet_type) or pa.types.is_fixed_size_binary(parquet_type)
231
+ pa.types.is_binary(parquet_type)
232
+ or pa.types.is_large_binary(parquet_type)
233
+ or pa.types.is_fixed_size_binary(parquet_type)
202
234
  )
203
235
 
204
236
  @staticmethod
@@ -221,13 +253,23 @@ class ParquetParser(FileTypeParser):
221
253
  pa.types.is_time(parquet_type)
222
254
  or pa.types.is_string(parquet_type)
223
255
  or pa.types.is_large_string(parquet_type)
224
- or ParquetParser._is_binary(parquet_type) # Best we can do is return as a string since we do not support binary
256
+ or ParquetParser._is_binary(
257
+ parquet_type
258
+ ) # Best we can do is return as a string since we do not support binary
225
259
  )
226
260
 
227
261
  @staticmethod
228
262
  def _is_object(parquet_type: pa.DataType) -> bool:
229
- return bool(pa.types.is_dictionary(parquet_type) or pa.types.is_struct(parquet_type) or pa.types.is_map(parquet_type))
263
+ return bool(
264
+ pa.types.is_dictionary(parquet_type)
265
+ or pa.types.is_struct(parquet_type)
266
+ or pa.types.is_map(parquet_type)
267
+ )
230
268
 
231
269
  @staticmethod
232
270
  def _is_list(parquet_type: pa.DataType) -> bool:
233
- return bool(pa.types.is_list(parquet_type) or pa.types.is_large_list(parquet_type) or parquet_type == pa.month_day_nano_interval())
271
+ return bool(
272
+ pa.types.is_list(parquet_type)
273
+ or pa.types.is_large_list(parquet_type)
274
+ or parquet_type == pa.month_day_nano_interval()
275
+ )
@@ -2,14 +2,24 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
  import logging
5
+ import os
5
6
  import traceback
6
7
  from datetime import datetime
7
8
  from io import BytesIO, IOBase
8
9
  from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
9
10
 
10
11
  import backoff
11
- import dpath.util
12
+ import dpath
13
+ import nltk
12
14
  import requests
15
+ from unstructured.file_utils.filetype import (
16
+ EXT_TO_FILETYPE,
17
+ FILETYPE_TO_MIMETYPE,
18
+ STR_TO_FILETYPE,
19
+ FileType,
20
+ detect_filetype,
21
+ )
22
+
13
23
  from airbyte_cdk.models import FailureType
14
24
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
15
25
  from airbyte_cdk.sources.file_based.config.unstructured_format import (
@@ -19,17 +29,31 @@ from airbyte_cdk.sources.file_based.config.unstructured_format import (
19
29
  UnstructuredFormat,
20
30
  )
21
31
  from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
22
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
32
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
33
+ AbstractFileBasedStreamReader,
34
+ FileReadMode,
35
+ )
23
36
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
24
37
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
25
38
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
26
39
  from airbyte_cdk.utils import is_cloud_environment
27
40
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
28
- from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType, detect_filetype
29
41
 
30
42
  unstructured_partition_pdf = None
31
43
  unstructured_partition_docx = None
32
44
  unstructured_partition_pptx = None
45
+ nltk_data_dir = "/tmp/nltk_data"
46
+
47
+ try:
48
+ os.makedirs(nltk_data_dir, exist_ok=True)
49
+ nltk.data.path.append(nltk_data_dir)
50
+ nltk.data.find("tokenizers/punkt.zip")
51
+ nltk.data.find("tokenizers/punkt_tab.zip")
52
+ nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
53
+ except LookupError:
54
+ nltk.download("punkt", download_dir=nltk_data_dir)
55
+ nltk.download("punkt_tab", download_dir=nltk_data_dir)
56
+ nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir)
33
57
 
34
58
 
35
59
  def optional_decode(contents: Union[str, bytes]) -> str:
@@ -100,16 +124,21 @@ class UnstructuredParser(FileTypeParser):
100
124
  format = _extract_format(config)
101
125
  with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
102
126
  filetype = self._get_filetype(file_handle, file)
103
-
104
127
  if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
105
- raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
128
+ raise self._create_parse_error(
129
+ file,
130
+ self._get_file_type_error_message(filetype),
131
+ )
106
132
 
107
133
  return {
108
134
  "content": {
109
135
  "type": "string",
110
136
  "description": "Content of the file as markdown. Might be null if the file could not be parsed",
111
137
  },
112
- "document_key": {"type": "string", "description": "Unique identifier of the document, e.g. the file path"},
138
+ "document_key": {
139
+ "type": "string",
140
+ "description": "Unique identifier of the document, e.g. the file path",
141
+ },
113
142
  "_ab_source_file_parse_error": {
114
143
  "type": "string",
115
144
  "description": "Error message if the file could not be parsed even though the file is supported",
@@ -148,26 +177,54 @@ class UnstructuredParser(FileTypeParser):
148
177
  logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
149
178
  else:
150
179
  raise e
180
+ except Exception as e:
181
+ exception_str = str(e)
182
+ logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
183
+ raise e
151
184
 
152
- def _read_file(self, file_handle: IOBase, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> str:
185
+ def _read_file(
186
+ self,
187
+ file_handle: IOBase,
188
+ remote_file: RemoteFile,
189
+ format: UnstructuredFormat,
190
+ logger: logging.Logger,
191
+ ) -> str:
153
192
  _import_unstructured()
154
- if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx):
193
+ if (
194
+ (not unstructured_partition_pdf)
195
+ or (not unstructured_partition_docx)
196
+ or (not unstructured_partition_pptx)
197
+ ):
155
198
  # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
156
199
  raise Exception("unstructured library is not available")
157
200
 
158
- filetype = self._get_filetype(file_handle, remote_file)
201
+ filetype: FileType | None = self._get_filetype(file_handle, remote_file)
159
202
 
160
- if filetype == FileType.MD or filetype == FileType.TXT:
203
+ if filetype is None or filetype not in self._supported_file_types():
204
+ raise self._create_parse_error(
205
+ remote_file,
206
+ self._get_file_type_error_message(filetype),
207
+ )
208
+ if filetype in {FileType.MD, FileType.TXT}:
161
209
  file_content: bytes = file_handle.read()
162
210
  decoded_content: str = optional_decode(file_content)
163
211
  return decoded_content
164
- if filetype not in self._supported_file_types():
165
- raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
166
212
  if format.processing.mode == "local":
167
- return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
213
+ return self._read_file_locally(
214
+ file_handle,
215
+ filetype,
216
+ format.strategy,
217
+ remote_file,
218
+ )
168
219
  elif format.processing.mode == "api":
169
220
  try:
170
- result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file)
221
+ result: str = self._read_file_remotely_with_retries(
222
+ file_handle,
223
+ format.processing,
224
+ filetype,
225
+ format.strategy,
226
+ remote_file,
227
+ )
171
228
  except Exception as e:
172
229
  # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
173
230
  #
@@ -175,11 +232,15 @@ class UnstructuredParser(FileTypeParser):
175
232
  # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
176
233
  if isinstance(e, RecordParseError):
177
234
  raise e
178
- raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error)
235
+ raise AirbyteTracedException.from_exception(
236
+ e, failure_type=FailureType.config_error
237
+ )
179
238
 
180
239
  return result
181
240
 
182
- def _params_to_dict(self, params: Optional[List[APIParameterConfigModel]], strategy: str) -> Dict[str, Union[str, List[str]]]:
241
+ def _params_to_dict(
242
+ self, params: Optional[List[APIParameterConfigModel]], strategy: str
243
+ ) -> Dict[str, Union[str, List[str]]]:
183
244
  result_dict: Dict[str, Union[str, List[str]]] = {"strategy": strategy}
184
245
  if params is None:
185
246
  return result_dict
@@ -229,9 +290,16 @@ class UnstructuredParser(FileTypeParser):
229
290
 
230
291
  return True, None
231
292
 
232
- @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error)
293
+ @backoff.on_exception(
294
+ backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error
295
+ )
233
296
  def _read_file_remotely_with_retries(
234
- self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
297
+ self,
298
+ file_handle: IOBase,
299
+ format: APIProcessingConfigModel,
300
+ filetype: FileType,
301
+ strategy: str,
302
+ remote_file: RemoteFile,
235
303
  ) -> str:
236
304
  """
237
305
  Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
@@ -239,7 +307,12 @@ class UnstructuredParser(FileTypeParser):
239
307
  return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
240
308
 
241
309
  def _read_file_remotely(
242
- self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
310
+ self,
311
+ file_handle: IOBase,
312
+ format: APIProcessingConfigModel,
313
+ filetype: FileType,
314
+ strategy: str,
315
+ remote_file: RemoteFile,
243
316
  ) -> str:
244
317
  headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
245
318
 
@@ -247,7 +320,9 @@ class UnstructuredParser(FileTypeParser):
247
320
 
248
321
  file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
249
322
 
250
- response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data)
323
+ response = requests.post(
324
+ f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
325
+ )
251
326
 
252
327
  if response.status_code == 422:
253
328
  # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
@@ -260,9 +335,15 @@ class UnstructuredParser(FileTypeParser):
260
335
 
261
336
  return self._render_markdown(json_response)
262
337
 
263
- def _read_file_locally(self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile) -> str:
338
+ def _read_file_locally(
339
+ self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
340
+ ) -> str:
264
341
  _import_unstructured()
265
- if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx):
342
+ if (
343
+ (not unstructured_partition_pdf)
344
+ or (not unstructured_partition_docx)
345
+ or (not unstructured_partition_pptx)
346
+ ):
266
347
  # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
267
348
  raise Exception("unstructured library is not available")
268
349
 
@@ -289,8 +370,14 @@ class UnstructuredParser(FileTypeParser):
289
370
 
290
371
  return self._render_markdown([element.to_dict() for element in elements])
291
372
 
292
- def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
293
- return RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message)
373
+ def _create_parse_error(
374
+ self,
375
+ remote_file: RemoteFile,
376
+ message: str,
377
+ ) -> RecordParseError:
378
+ return RecordParseError(
379
+ FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
380
+ )
294
381
 
295
382
  def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
296
383
  """
@@ -311,39 +398,58 @@ class UnstructuredParser(FileTypeParser):
311
398
  # detect_filetype is either using the file name or file content
312
399
  # if possible, try to leverage the file name to detect the file type
313
400
  # if the file name is not available, use the file content
314
- file_type = detect_filetype(
315
- filename=remote_file.uri,
316
- )
317
- if file_type is not None and not file_type == FileType.UNK:
401
+ file_type: FileType | None = None
402
+ try:
403
+ file_type = detect_filetype(
404
+ filename=remote_file.uri,
405
+ )
406
+ except Exception:
407
+ # Path doesn't exist locally. Try something else...
408
+ pass
409
+
410
+ if file_type and file_type != FileType.UNK:
318
411
  return file_type
319
412
 
320
413
  type_based_on_content = detect_filetype(file=file)
414
+ file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
321
415
 
322
- # detect_filetype is reading to read the file content
323
- file.seek(0)
416
+ if type_based_on_content and type_based_on_content != FileType.UNK:
417
+ return type_based_on_content
324
418
 
325
- return type_based_on_content
419
+ extension = "." + remote_file.uri.split(".")[-1].lower()
420
+ if extension in EXT_TO_FILETYPE:
421
+ return EXT_TO_FILETYPE[extension]
422
+
423
+ return None
326
424
 
327
425
  def _supported_file_types(self) -> List[Any]:
328
426
  return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
329
427
 
330
- def _get_file_type_error_message(self, file_type: FileType) -> str:
428
+ def _get_file_type_error_message(
429
+ self,
430
+ file_type: FileType | None,
431
+ ) -> str:
331
432
  supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
332
- return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
433
+ return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
333
434
 
334
435
  def _render_markdown(self, elements: List[Any]) -> str:
335
436
  return "\n\n".join((self._convert_to_markdown(el) for el in elements))
336
437
 
337
438
  def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
338
- if dpath.util.get(el, "type") == "Title":
339
- heading_str = "#" * (dpath.util.get(el, "metadata/category_depth", default=1) or 1)
340
- return f"{heading_str} {dpath.util.get(el, 'text')}"
341
- elif dpath.util.get(el, "type") == "ListItem":
342
- return f"- {dpath.util.get(el, 'text')}"
343
- elif dpath.util.get(el, "type") == "Formula":
344
- return f"```\n{dpath.util.get(el, 'text')}\n```"
439
+ if dpath.get(el, "type") == "Title":
440
+ category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
441
+ if not isinstance(category_depth, int):
442
+ category_depth = (
443
+ int(category_depth) if isinstance(category_depth, (str, float)) else 1
444
+ )
445
+ heading_str = "#" * category_depth
446
+ return f"{heading_str} {dpath.get(el, 'text')}"
447
+ elif dpath.get(el, "type") == "ListItem":
448
+ return f"- {dpath.get(el, 'text')}"
449
+ elif dpath.get(el, "type") == "Formula":
450
+ return f"```\n{dpath.get(el, 'text')}\n```"
345
451
  else:
346
- return str(dpath.util.get(el, "text", default=""))
452
+ return str(dpath.get(el, "text", default=""))
347
453
 
348
454
  @property
349
455
  def file_read_mode(self) -> FileReadMode:
@@ -5,7 +5,7 @@
5
5
  from datetime import datetime
6
6
  from typing import Optional
7
7
 
8
- from pydantic import BaseModel
8
+ from pydantic.v1 import BaseModel
9
9
 
10
10
 
11
11
  class RemoteFile(BaseModel):