airbyte-cdk 0.72.1__py3-none-any.whl → 6.17.1.dev1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (518) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +490 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1185 -85
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +63 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +31 -3
  79. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +340 -0
  80. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  81. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  82. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  83. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +174 -74
  84. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  85. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  86. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  91. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  92. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  93. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  94. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  95. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  96. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  97. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  98. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1319 -603
  99. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  100. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  101. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  102. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1759 -225
  103. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  104. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  105. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  106. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  107. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  108. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  109. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  122. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  123. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  124. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  125. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  126. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  127. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  134. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  135. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  136. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  140. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  141. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  142. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  143. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  144. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  145. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  146. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  147. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  148. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  149. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  150. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +229 -73
  151. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  152. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  153. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  154. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  155. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  156. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  157. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  158. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  159. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  160. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  163. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  164. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  165. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  166. airbyte_cdk/sources/declarative/types.py +19 -110
  167. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  168. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  169. airbyte_cdk/sources/embedded/catalog.py +16 -4
  170. airbyte_cdk/sources/embedded/runner.py +19 -3
  171. airbyte_cdk/sources/embedded/tools.py +5 -2
  172. airbyte_cdk/sources/file_based/README.md +152 -0
  173. airbyte_cdk/sources/file_based/__init__.py +24 -0
  174. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  175. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  176. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  177. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +47 -10
  178. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  179. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  180. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  181. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  182. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  184. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  185. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  186. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  187. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  188. airbyte_cdk/sources/file_based/exceptions.py +18 -15
  189. airbyte_cdk/sources/file_based/file_based_source.py +140 -33
  190. airbyte_cdk/sources/file_based/file_based_stream_reader.py +69 -5
  191. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  192. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  193. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  194. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  195. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  196. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  197. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  198. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  199. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +141 -41
  200. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  201. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  202. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  204. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  205. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  206. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  210. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  211. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  212. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  213. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +147 -45
  214. airbyte_cdk/sources/http_logger.py +8 -3
  215. airbyte_cdk/sources/message/__init__.py +7 -1
  216. airbyte_cdk/sources/message/repository.py +18 -4
  217. airbyte_cdk/sources/source.py +42 -38
  218. airbyte_cdk/sources/streams/__init__.py +2 -2
  219. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  220. airbyte_cdk/sources/streams/call_rate.py +64 -21
  221. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  222. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  223. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  224. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  225. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  226. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  227. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  228. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  229. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  230. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  231. airbyte_cdk/sources/streams/concurrent/cursor.py +313 -48
  232. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  233. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  234. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  236. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  237. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  238. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  239. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  240. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  241. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  242. airbyte_cdk/sources/streams/core.py +412 -87
  243. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  244. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  245. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  253. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  254. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  255. airbyte_cdk/sources/streams/http/http.py +369 -246
  256. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  257. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  259. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  260. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  261. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  262. airbyte_cdk/sources/types.py +154 -0
  263. airbyte_cdk/sources/utils/record_helper.py +36 -21
  264. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  265. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  266. airbyte_cdk/sources/utils/transform.py +54 -20
  267. airbyte_cdk/sql/_util/hashing.py +34 -0
  268. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  269. airbyte_cdk/sql/constants.py +32 -0
  270. airbyte_cdk/sql/exceptions.py +235 -0
  271. airbyte_cdk/sql/secrets.py +123 -0
  272. airbyte_cdk/sql/shared/__init__.py +15 -0
  273. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  274. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  275. airbyte_cdk/sql/types.py +160 -0
  276. airbyte_cdk/test/catalog_builder.py +70 -18
  277. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  278. airbyte_cdk/test/mock_http/__init__.py +1 -1
  279. airbyte_cdk/test/mock_http/matcher.py +6 -0
  280. airbyte_cdk/test/mock_http/mocker.py +57 -10
  281. airbyte_cdk/test/mock_http/request.py +19 -3
  282. airbyte_cdk/test/mock_http/response.py +3 -1
  283. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  284. airbyte_cdk/test/state_builder.py +18 -10
  285. airbyte_cdk/test/utils/__init__.py +1 -0
  286. airbyte_cdk/test/utils/data.py +24 -0
  287. airbyte_cdk/test/utils/http_mocking.py +16 -0
  288. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  289. airbyte_cdk/test/utils/reading.py +26 -0
  290. airbyte_cdk/utils/__init__.py +2 -1
  291. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  292. airbyte_cdk/utils/analytics_message.py +10 -2
  293. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  294. airbyte_cdk/utils/event_timing.py +10 -10
  295. airbyte_cdk/utils/mapping_helpers.py +3 -1
  296. airbyte_cdk/utils/message_utils.py +20 -11
  297. airbyte_cdk/utils/print_buffer.py +75 -0
  298. airbyte_cdk/utils/schema_inferrer.py +198 -28
  299. airbyte_cdk/utils/slice_hasher.py +30 -0
  300. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  301. airbyte_cdk/utils/stream_status_utils.py +8 -1
  302. airbyte_cdk/utils/traced_exception.py +61 -21
  303. airbyte_cdk-6.17.1.dev1.dist-info/METADATA +109 -0
  304. airbyte_cdk-6.17.1.dev1.dist-info/RECORD +350 -0
  305. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev1.dist-info}/WHEEL +1 -2
  306. airbyte_cdk-6.17.1.dev1.dist-info/entry_points.txt +3 -0
  307. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  308. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  309. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  311. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  312. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  313. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  314. airbyte_cdk/sources/deprecated/client.py +0 -99
  315. airbyte_cdk/sources/singer/__init__.py +0 -8
  316. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  317. airbyte_cdk/sources/singer/source.py +0 -186
  318. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  319. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  320. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  321. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  322. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  323. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  324. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  325. airbyte_cdk/sources/utils/schema_models.py +0 -84
  326. airbyte_cdk-0.72.1.dist-info/METADATA +0 -243
  327. airbyte_cdk-0.72.1.dist-info/RECORD +0 -466
  328. airbyte_cdk-0.72.1.dist-info/top_level.txt +0 -3
  329. source_declarative_manifest/main.py +0 -29
  330. unit_tests/connector_builder/__init__.py +0 -3
  331. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  332. unit_tests/connector_builder/test_message_grouper.py +0 -713
  333. unit_tests/connector_builder/utils.py +0 -27
  334. unit_tests/destinations/test_destination.py +0 -243
  335. unit_tests/singer/test_singer_helpers.py +0 -56
  336. unit_tests/singer/test_singer_source.py +0 -112
  337. unit_tests/sources/__init__.py +0 -0
  338. unit_tests/sources/concurrent_source/__init__.py +0 -3
  339. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  340. unit_tests/sources/declarative/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/__init__.py +0 -3
  342. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  343. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  344. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  345. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  346. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  347. unit_tests/sources/declarative/checks/__init__.py +0 -3
  348. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  349. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  350. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  351. unit_tests/sources/declarative/external_component.py +0 -13
  352. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  353. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  355. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  356. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  357. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  359. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  360. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  361. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  362. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  363. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  364. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  365. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  366. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  367. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  368. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  369. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  370. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  371. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1847
  372. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  373. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  374. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  375. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  376. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  377. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  384. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  385. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  386. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  387. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  388. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  389. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  390. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  391. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  392. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  393. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  394. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  395. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  396. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  397. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  398. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  399. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  400. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  401. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  402. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  403. unit_tests/sources/declarative/schema/__init__.py +0 -6
  404. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  405. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  406. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  407. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  408. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  409. unit_tests/sources/declarative/states/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  411. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  412. unit_tests/sources/declarative/test_create_partial.py +0 -83
  413. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  414. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  415. unit_tests/sources/declarative/test_types.py +0 -39
  416. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  417. unit_tests/sources/file_based/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  419. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  420. unit_tests/sources/file_based/config/__init__.py +0 -0
  421. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  422. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  423. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  424. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  425. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  426. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  427. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  428. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  429. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  430. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  431. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  432. unit_tests/sources/file_based/helpers.py +0 -70
  433. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  434. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  435. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  436. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  437. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  438. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  439. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  440. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  441. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  442. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  443. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  444. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  445. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  446. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  447. unit_tests/sources/file_based/stream/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  449. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  450. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  451. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  452. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  453. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  454. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  455. unit_tests/sources/file_based/test_scenarios.py +0 -253
  456. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  457. unit_tests/sources/fixtures/__init__.py +0 -3
  458. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  459. unit_tests/sources/message/__init__.py +0 -0
  460. unit_tests/sources/message/test_repository.py +0 -153
  461. unit_tests/sources/streams/__init__.py +0 -0
  462. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  464. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  466. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  467. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  469. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  470. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  471. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  472. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  473. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  474. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  475. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  476. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  477. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  478. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  479. unit_tests/sources/streams/http/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  481. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  482. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  483. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  484. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  485. unit_tests/sources/streams/http/test_http.py +0 -635
  486. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  487. unit_tests/sources/streams/test_call_rate.py +0 -300
  488. unit_tests/sources/streams/test_stream_read.py +0 -405
  489. unit_tests/sources/streams/test_streams_core.py +0 -184
  490. unit_tests/sources/test_abstract_source.py +0 -1442
  491. unit_tests/sources/test_concurrent_source.py +0 -112
  492. unit_tests/sources/test_config.py +0 -92
  493. unit_tests/sources/test_connector_state_manager.py +0 -482
  494. unit_tests/sources/test_http_logger.py +0 -252
  495. unit_tests/sources/test_integration_source.py +0 -86
  496. unit_tests/sources/test_source.py +0 -684
  497. unit_tests/sources/test_source_read.py +0 -460
  498. unit_tests/test/__init__.py +0 -0
  499. unit_tests/test/mock_http/__init__.py +0 -0
  500. unit_tests/test/mock_http/test_matcher.py +0 -53
  501. unit_tests/test/mock_http/test_mocker.py +0 -214
  502. unit_tests/test/mock_http/test_request.py +0 -117
  503. unit_tests/test/mock_http/test_response_builder.py +0 -177
  504. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  505. unit_tests/utils/__init__.py +0 -0
  506. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  507. unit_tests/utils/test_mapping_helpers.py +0 -54
  508. unit_tests/utils/test_message_utils.py +0 -91
  509. unit_tests/utils/test_rate_limiting.py +0 -26
  510. unit_tests/utils/test_schema_inferrer.py +0 -202
  511. unit_tests/utils/test_secret_utils.py +0 -135
  512. unit_tests/utils/test_stream_status_utils.py +0 -61
  513. unit_tests/utils/test_traced_exception.py +0 -107
  514. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  515. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  516. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  517. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  518. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev1.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,37 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import logging
5
+ import os
6
+ from typing import Any, Dict, Iterable
7
+
8
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
9
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
10
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
+
12
+ AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
13
+ DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
14
+
15
+
16
+ class FileTransfer:
17
+ def __init__(self) -> None:
18
+ self._local_directory = (
19
+ AIRBYTE_STAGING_DIRECTORY
20
+ if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
21
+ else DEFAULT_LOCAL_DIRECTORY
22
+ )
23
+
24
+ def get_file(
25
+ self,
26
+ config: FileBasedStreamConfig,
27
+ file: RemoteFile,
28
+ stream_reader: AbstractFileBasedStreamReader,
29
+ logger: logging.Logger,
30
+ ) -> Iterable[Dict[str, Any]]:
31
+ try:
32
+ yield stream_reader.get_file(
33
+ file=file, local_directory=self._local_directory, logger=logger
34
+ )
35
+ except Exception as ex:
36
+ logger.error("An error has occurred while getting file: %s", str(ex))
37
+ raise ex
@@ -7,7 +7,10 @@ from abc import ABC, abstractmethod
7
7
  from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
8
8
 
9
9
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
10
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
11
+ AbstractFileBasedStreamReader,
12
+ FileReadMode,
13
+ )
11
14
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
12
15
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
13
16
 
@@ -6,16 +6,24 @@ import json
6
6
  import logging
7
7
  from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
8
8
 
9
+ import orjson
10
+
9
11
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
12
  from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
11
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
13
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
14
+ AbstractFileBasedStreamReader,
15
+ FileReadMode,
16
+ )
12
17
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
13
18
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
14
- from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, SchemaType, merge_schemas
19
+ from airbyte_cdk.sources.file_based.schema_helpers import (
20
+ PYTHON_TYPE_MAPPING,
21
+ SchemaType,
22
+ merge_schemas,
23
+ )
15
24
 
16
25
 
17
26
  class JsonlParser(FileTypeParser):
18
-
19
27
  MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
20
28
  ENCODING = "utf8"
21
29
 
@@ -100,18 +108,24 @@ class JsonlParser(FileTypeParser):
100
108
  read_bytes += len(line)
101
109
  accumulator += line # type: ignore [operator] # In reality, it's either bytes or string and we add the same type
102
110
  try:
103
- record = json.loads(accumulator)
111
+ record = orjson.loads(accumulator)
104
112
  if had_json_parsing_error and not has_warned_for_multiline_json_object:
105
- logger.warning(f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced")
113
+ logger.warning(
114
+ f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced"
115
+ )
106
116
  has_warned_for_multiline_json_object = True
107
117
 
108
118
  yield record
109
119
  yielded_at_least_once = True
110
120
  accumulator = self._instantiate_accumulator(line)
111
- except json.JSONDecodeError:
121
+ except orjson.JSONDecodeError:
112
122
  had_json_parsing_error = True
113
123
 
114
- if read_limit and yielded_at_least_once and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
124
+ if (
125
+ read_limit
126
+ and yielded_at_least_once
127
+ and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE
128
+ ):
115
129
  logger.warning(
116
130
  f"Exceeded the maximum number of bytes per file for schema inference ({self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE}). "
117
131
  f"Inferring schema from an incomplete set of records."
@@ -119,7 +133,9 @@ class JsonlParser(FileTypeParser):
119
133
  break
120
134
 
121
135
  if had_json_parsing_error and not yielded_at_least_once:
122
- raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
136
+ raise RecordParseError(
137
+ FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line
138
+ )
123
139
 
124
140
  @staticmethod
125
141
  def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
@@ -10,17 +10,27 @@ from urllib.parse import unquote
10
10
 
11
11
  import pyarrow as pa
12
12
  import pyarrow.parquet as pq
13
- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
14
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
15
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
13
+ from pyarrow import DictionaryArray, Scalar
14
+
15
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
16
+ FileBasedStreamConfig,
17
+ ParquetFormat,
18
+ )
19
+ from airbyte_cdk.sources.file_based.exceptions import (
20
+ ConfigValidationError,
21
+ FileBasedSourceError,
22
+ RecordParseError,
23
+ )
24
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
25
+ AbstractFileBasedStreamReader,
26
+ FileReadMode,
27
+ )
16
28
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
17
29
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
18
30
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
19
- from pyarrow import DictionaryArray, Scalar
20
31
 
21
32
 
22
33
  class ParquetParser(FileTypeParser):
23
-
24
34
  ENCODING = None
25
35
 
26
36
  def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
@@ -45,9 +55,15 @@ class ParquetParser(FileTypeParser):
45
55
  parquet_schema = parquet_file.schema_arrow
46
56
 
47
57
  # Inferred non-partition schema
48
- schema = {field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format) for field in parquet_schema}
58
+ schema = {
59
+ field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format)
60
+ for field in parquet_schema
61
+ }
49
62
  # Inferred partition schema
50
- partition_columns = {partition.split("=")[0]: {"type": "string"} for partition in self._extract_partitions(file.uri)}
63
+ partition_columns = {
64
+ partition.split("=")[0]: {"type": "string"}
65
+ for partition in self._extract_partitions(file.uri)
66
+ }
51
67
 
52
68
  schema.update(partition_columns)
53
69
  return schema
@@ -69,21 +85,27 @@ class ParquetParser(FileTypeParser):
69
85
  try:
70
86
  with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
71
87
  reader = pq.ParquetFile(fp)
72
- partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
88
+ partition_columns = {
89
+ x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)
90
+ }
73
91
  for row_group in range(reader.num_row_groups):
74
92
  batch = reader.read_row_group(row_group)
75
93
  for row in range(batch.num_rows):
76
94
  line_no += 1
77
95
  yield {
78
96
  **{
79
- column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
97
+ column: ParquetParser._to_output_value(
98
+ batch.column(column)[row], parquet_format
99
+ )
80
100
  for column in batch.column_names
81
101
  },
82
102
  **partition_columns,
83
103
  }
84
104
  except Exception as exc:
85
105
  raise RecordParseError(
86
- FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
106
+ FileBasedSourceError.ERROR_PARSING_RECORD,
107
+ filename=file.uri,
108
+ lineno=f"{row_group=}, {line_no=}",
87
109
  ) from exc
88
110
 
89
111
  @staticmethod
@@ -95,7 +117,9 @@ class ParquetParser(FileTypeParser):
95
117
  return FileReadMode.READ_BINARY
96
118
 
97
119
  @staticmethod
98
- def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any:
120
+ def _to_output_value(
121
+ parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat
122
+ ) -> Any:
99
123
  """
100
124
  Convert an entry in a pyarrow table to a value that can be output by the source.
101
125
  """
@@ -113,7 +137,11 @@ class ParquetParser(FileTypeParser):
113
137
  return None
114
138
 
115
139
  # Convert date and datetime objects to isoformat strings
116
- if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type):
140
+ if (
141
+ pa.types.is_time(parquet_value.type)
142
+ or pa.types.is_timestamp(parquet_value.type)
143
+ or pa.types.is_date(parquet_value.type)
144
+ ):
117
145
  return parquet_value.as_py().isoformat()
118
146
 
119
147
  # Convert month_day_nano_interval to array
@@ -126,7 +154,7 @@ class ParquetParser(FileTypeParser):
126
154
 
127
155
  if pa.types.is_decimal(parquet_value.type):
128
156
  if parquet_format.decimal_as_float:
129
- return parquet_value.as_py()
157
+ return float(parquet_value.as_py())
130
158
  else:
131
159
  return str(parquet_value.as_py())
132
160
 
@@ -168,7 +196,9 @@ class ParquetParser(FileTypeParser):
168
196
  }
169
197
 
170
198
  @staticmethod
171
- def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]:
199
+ def parquet_type_to_schema_type(
200
+ parquet_type: pa.DataType, parquet_format: ParquetFormat
201
+ ) -> Mapping[str, str]:
172
202
  """
173
203
  Convert a pyarrow data type to an Airbyte schema type.
174
204
  Parquet data types are defined at https://arrow.apache.org/docs/python/api/datatypes.html
@@ -198,7 +228,9 @@ class ParquetParser(FileTypeParser):
198
228
  @staticmethod
199
229
  def _is_binary(parquet_type: pa.DataType) -> bool:
200
230
  return bool(
201
- pa.types.is_binary(parquet_type) or pa.types.is_large_binary(parquet_type) or pa.types.is_fixed_size_binary(parquet_type)
231
+ pa.types.is_binary(parquet_type)
232
+ or pa.types.is_large_binary(parquet_type)
233
+ or pa.types.is_fixed_size_binary(parquet_type)
202
234
  )
203
235
 
204
236
  @staticmethod
@@ -221,13 +253,23 @@ class ParquetParser(FileTypeParser):
221
253
  pa.types.is_time(parquet_type)
222
254
  or pa.types.is_string(parquet_type)
223
255
  or pa.types.is_large_string(parquet_type)
224
- or ParquetParser._is_binary(parquet_type) # Best we can do is return as a string since we do not support binary
256
+ or ParquetParser._is_binary(
257
+ parquet_type
258
+ ) # Best we can do is return as a string since we do not support binary
225
259
  )
226
260
 
227
261
  @staticmethod
228
262
  def _is_object(parquet_type: pa.DataType) -> bool:
229
- return bool(pa.types.is_dictionary(parquet_type) or pa.types.is_struct(parquet_type) or pa.types.is_map(parquet_type))
263
+ return bool(
264
+ pa.types.is_dictionary(parquet_type)
265
+ or pa.types.is_struct(parquet_type)
266
+ or pa.types.is_map(parquet_type)
267
+ )
230
268
 
231
269
  @staticmethod
232
270
  def _is_list(parquet_type: pa.DataType) -> bool:
233
- return bool(pa.types.is_list(parquet_type) or pa.types.is_large_list(parquet_type) or parquet_type == pa.month_day_nano_interval())
271
+ return bool(
272
+ pa.types.is_list(parquet_type)
273
+ or pa.types.is_large_list(parquet_type)
274
+ or parquet_type == pa.month_day_nano_interval()
275
+ )
@@ -8,8 +8,17 @@ from io import BytesIO, IOBase
8
8
  from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
9
9
 
10
10
  import backoff
11
- import dpath.util
11
+ import dpath
12
+ import nltk
12
13
  import requests
14
+ from unstructured.file_utils.filetype import (
15
+ EXT_TO_FILETYPE,
16
+ FILETYPE_TO_MIMETYPE,
17
+ STR_TO_FILETYPE,
18
+ FileType,
19
+ detect_filetype,
20
+ )
21
+
13
22
  from airbyte_cdk.models import FailureType
14
23
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
15
24
  from airbyte_cdk.sources.file_based.config.unstructured_format import (
@@ -19,18 +28,27 @@ from airbyte_cdk.sources.file_based.config.unstructured_format import (
19
28
  UnstructuredFormat,
20
29
  )
21
30
  from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
22
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
31
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
32
+ AbstractFileBasedStreamReader,
33
+ FileReadMode,
34
+ )
23
35
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
24
36
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
25
37
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
26
38
  from airbyte_cdk.utils import is_cloud_environment
27
39
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
28
- from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType, detect_filetype
29
40
 
30
41
  unstructured_partition_pdf = None
31
42
  unstructured_partition_docx = None
32
43
  unstructured_partition_pptx = None
33
44
 
45
+ try:
46
+ nltk.data.find("tokenizers/punkt.zip")
47
+ nltk.data.find("tokenizers/punkt_tab.zip")
48
+ except LookupError:
49
+ nltk.download("punkt")
50
+ nltk.download("punkt_tab")
51
+
34
52
 
35
53
  def optional_decode(contents: Union[str, bytes]) -> str:
36
54
  if isinstance(contents, bytes):
@@ -100,16 +118,21 @@ class UnstructuredParser(FileTypeParser):
100
118
  format = _extract_format(config)
101
119
  with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
102
120
  filetype = self._get_filetype(file_handle, file)
103
-
104
121
  if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
105
- raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
122
+ raise self._create_parse_error(
123
+ file,
124
+ self._get_file_type_error_message(filetype),
125
+ )
106
126
 
107
127
  return {
108
128
  "content": {
109
129
  "type": "string",
110
130
  "description": "Content of the file as markdown. Might be null if the file could not be parsed",
111
131
  },
112
- "document_key": {"type": "string", "description": "Unique identifier of the document, e.g. the file path"},
132
+ "document_key": {
133
+ "type": "string",
134
+ "description": "Unique identifier of the document, e.g. the file path",
135
+ },
113
136
  "_ab_source_file_parse_error": {
114
137
  "type": "string",
115
138
  "description": "Error message if the file could not be parsed even though the file is supported",
@@ -148,26 +171,54 @@ class UnstructuredParser(FileTypeParser):
148
171
  logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
149
172
  else:
150
173
  raise e
174
+ except Exception as e:
175
+ exception_str = str(e)
176
+ logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
177
+ raise e
151
178
 
152
- def _read_file(self, file_handle: IOBase, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> str:
179
+ def _read_file(
180
+ self,
181
+ file_handle: IOBase,
182
+ remote_file: RemoteFile,
183
+ format: UnstructuredFormat,
184
+ logger: logging.Logger,
185
+ ) -> str:
153
186
  _import_unstructured()
154
- if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx):
187
+ if (
188
+ (not unstructured_partition_pdf)
189
+ or (not unstructured_partition_docx)
190
+ or (not unstructured_partition_pptx)
191
+ ):
155
192
  # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
156
193
  raise Exception("unstructured library is not available")
157
194
 
158
- filetype = self._get_filetype(file_handle, remote_file)
195
+ filetype: FileType | None = self._get_filetype(file_handle, remote_file)
159
196
 
160
- if filetype == FileType.MD or filetype == FileType.TXT:
197
+ if filetype is None or filetype not in self._supported_file_types():
198
+ raise self._create_parse_error(
199
+ remote_file,
200
+ self._get_file_type_error_message(filetype),
201
+ )
202
+ if filetype in {FileType.MD, FileType.TXT}:
161
203
  file_content: bytes = file_handle.read()
162
204
  decoded_content: str = optional_decode(file_content)
163
205
  return decoded_content
164
- if filetype not in self._supported_file_types():
165
- raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
166
206
  if format.processing.mode == "local":
167
- return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
207
+ return self._read_file_locally(
208
+ file_handle,
209
+ filetype,
210
+ format.strategy,
211
+ remote_file,
212
+ )
168
213
  elif format.processing.mode == "api":
169
214
  try:
170
- result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file)
215
+ result: str = self._read_file_remotely_with_retries(
216
+ file_handle,
217
+ format.processing,
218
+ filetype,
219
+ format.strategy,
220
+ remote_file,
221
+ )
171
222
  except Exception as e:
172
223
  # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
173
224
  #
@@ -175,11 +226,15 @@ class UnstructuredParser(FileTypeParser):
175
226
  # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
176
227
  if isinstance(e, RecordParseError):
177
228
  raise e
178
- raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error)
229
+ raise AirbyteTracedException.from_exception(
230
+ e, failure_type=FailureType.config_error
231
+ )
179
232
 
180
233
  return result
181
234
 
182
- def _params_to_dict(self, params: Optional[List[APIParameterConfigModel]], strategy: str) -> Dict[str, Union[str, List[str]]]:
235
+ def _params_to_dict(
236
+ self, params: Optional[List[APIParameterConfigModel]], strategy: str
237
+ ) -> Dict[str, Union[str, List[str]]]:
183
238
  result_dict: Dict[str, Union[str, List[str]]] = {"strategy": strategy}
184
239
  if params is None:
185
240
  return result_dict
@@ -229,9 +284,16 @@ class UnstructuredParser(FileTypeParser):
229
284
 
230
285
  return True, None
231
286
 
232
- @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error)
287
+ @backoff.on_exception(
288
+ backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error
289
+ )
233
290
  def _read_file_remotely_with_retries(
234
- self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
291
+ self,
292
+ file_handle: IOBase,
293
+ format: APIProcessingConfigModel,
294
+ filetype: FileType,
295
+ strategy: str,
296
+ remote_file: RemoteFile,
235
297
  ) -> str:
236
298
  """
237
299
  Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
@@ -239,7 +301,12 @@ class UnstructuredParser(FileTypeParser):
239
301
  return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
240
302
 
241
303
  def _read_file_remotely(
242
- self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
304
+ self,
305
+ file_handle: IOBase,
306
+ format: APIProcessingConfigModel,
307
+ filetype: FileType,
308
+ strategy: str,
309
+ remote_file: RemoteFile,
243
310
  ) -> str:
244
311
  headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
245
312
 
@@ -247,7 +314,9 @@ class UnstructuredParser(FileTypeParser):
247
314
 
248
315
  file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
249
316
 
250
- response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data)
317
+ response = requests.post(
318
+ f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
319
+ )
251
320
 
252
321
  if response.status_code == 422:
253
322
  # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
@@ -260,9 +329,15 @@ class UnstructuredParser(FileTypeParser):
260
329
 
261
330
  return self._render_markdown(json_response)
262
331
 
263
- def _read_file_locally(self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile) -> str:
332
+ def _read_file_locally(
333
+ self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
334
+ ) -> str:
264
335
  _import_unstructured()
265
- if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx):
336
+ if (
337
+ (not unstructured_partition_pdf)
338
+ or (not unstructured_partition_docx)
339
+ or (not unstructured_partition_pptx)
340
+ ):
266
341
  # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
267
342
  raise Exception("unstructured library is not available")
268
343
 
@@ -289,8 +364,14 @@ class UnstructuredParser(FileTypeParser):
289
364
 
290
365
  return self._render_markdown([element.to_dict() for element in elements])
291
366
 
292
- def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
293
- return RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message)
367
+ def _create_parse_error(
368
+ self,
369
+ remote_file: RemoteFile,
370
+ message: str,
371
+ ) -> RecordParseError:
372
+ return RecordParseError(
373
+ FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
374
+ )
294
375
 
295
376
  def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
296
377
  """
@@ -311,39 +392,58 @@ class UnstructuredParser(FileTypeParser):
311
392
  # detect_filetype is either using the file name or file content
312
393
  # if possible, try to leverage the file name to detect the file type
313
394
  # if the file name is not available, use the file content
314
- file_type = detect_filetype(
315
- filename=remote_file.uri,
316
- )
317
- if file_type is not None and not file_type == FileType.UNK:
395
+ file_type: FileType | None = None
396
+ try:
397
+ file_type = detect_filetype(
398
+ filename=remote_file.uri,
399
+ )
400
+ except Exception:
401
+ # Path doesn't exist locally. Try something else...
402
+ pass
403
+
404
+ if file_type and file_type != FileType.UNK:
318
405
  return file_type
319
406
 
320
407
  type_based_on_content = detect_filetype(file=file)
408
+ file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
321
409
 
322
- # detect_filetype is reading to read the file content
323
- file.seek(0)
410
+ if type_based_on_content and type_based_on_content != FileType.UNK:
411
+ return type_based_on_content
324
412
 
325
- return type_based_on_content
413
+ extension = "." + remote_file.uri.split(".")[-1].lower()
414
+ if extension in EXT_TO_FILETYPE:
415
+ return EXT_TO_FILETYPE[extension]
416
+
417
+ return None
326
418
 
327
419
  def _supported_file_types(self) -> List[Any]:
328
420
  return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
329
421
 
330
- def _get_file_type_error_message(self, file_type: FileType) -> str:
422
+ def _get_file_type_error_message(
423
+ self,
424
+ file_type: FileType | None,
425
+ ) -> str:
331
426
  supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
332
- return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
427
+ return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
333
428
 
334
429
  def _render_markdown(self, elements: List[Any]) -> str:
335
430
  return "\n\n".join((self._convert_to_markdown(el) for el in elements))
336
431
 
337
432
  def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
338
- if dpath.util.get(el, "type") == "Title":
339
- heading_str = "#" * (dpath.util.get(el, "metadata/category_depth", default=1) or 1)
340
- return f"{heading_str} {dpath.util.get(el, 'text')}"
341
- elif dpath.util.get(el, "type") == "ListItem":
342
- return f"- {dpath.util.get(el, 'text')}"
343
- elif dpath.util.get(el, "type") == "Formula":
344
- return f"```\n{dpath.util.get(el, 'text')}\n```"
433
+ if dpath.get(el, "type") == "Title":
434
+ category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
435
+ if not isinstance(category_depth, int):
436
+ category_depth = (
437
+ int(category_depth) if isinstance(category_depth, (str, float)) else 1
438
+ )
439
+ heading_str = "#" * category_depth
440
+ return f"{heading_str} {dpath.get(el, 'text')}"
441
+ elif dpath.get(el, "type") == "ListItem":
442
+ return f"- {dpath.get(el, 'text')}"
443
+ elif dpath.get(el, "type") == "Formula":
444
+ return f"```\n{dpath.get(el, 'text')}\n```"
345
445
  else:
346
- return str(dpath.util.get(el, "text", default=""))
446
+ return str(dpath.get(el, "text", default=""))
347
447
 
348
448
  @property
349
449
  def file_read_mode(self) -> FileReadMode:
@@ -5,7 +5,7 @@
5
5
  from datetime import datetime
6
6
  from typing import Optional
7
7
 
8
- from pydantic import BaseModel
8
+ from pydantic.v1 import BaseModel
9
9
 
10
10
 
11
11
  class RemoteFile(BaseModel):