airbyte-cdk 0.72.0__py3-none-any.whl → 6.13.1.dev4106__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (517) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +421 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1213 -88
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +65 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +25 -3
  79. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  80. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  81. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  82. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +159 -74
  83. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  84. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  85. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  86. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  91. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  92. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  93. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  94. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  95. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  96. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  97. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1329 -595
  98. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  99. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  100. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  101. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1699 -226
  102. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  103. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  104. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  105. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  106. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  107. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  108. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  109. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  122. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  123. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  124. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  125. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  126. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  127. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  134. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  135. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  136. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  140. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  141. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  142. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  143. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  144. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  145. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  146. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  147. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  148. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  149. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +228 -72
  150. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  151. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  152. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  153. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  154. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  155. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  156. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  157. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  158. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  159. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  160. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  163. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  164. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  165. airbyte_cdk/sources/declarative/types.py +19 -110
  166. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  167. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  168. airbyte_cdk/sources/embedded/catalog.py +16 -4
  169. airbyte_cdk/sources/embedded/runner.py +19 -3
  170. airbyte_cdk/sources/embedded/tools.py +5 -2
  171. airbyte_cdk/sources/file_based/README.md +152 -0
  172. airbyte_cdk/sources/file_based/__init__.py +24 -0
  173. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  174. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  175. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  176. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +58 -10
  177. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  178. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  179. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  180. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  181. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  182. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  184. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  185. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  186. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  187. airbyte_cdk/sources/file_based/exceptions.py +52 -15
  188. airbyte_cdk/sources/file_based/file_based_source.py +163 -33
  189. airbyte_cdk/sources/file_based/file_based_stream_reader.py +83 -5
  190. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  191. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  192. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  193. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  194. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  195. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  196. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  197. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  198. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +145 -41
  199. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  200. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  201. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  202. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  204. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  205. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  206. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  210. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  211. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  212. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +175 -45
  213. airbyte_cdk/sources/http_logger.py +8 -3
  214. airbyte_cdk/sources/message/__init__.py +7 -1
  215. airbyte_cdk/sources/message/repository.py +18 -4
  216. airbyte_cdk/sources/source.py +42 -38
  217. airbyte_cdk/sources/streams/__init__.py +2 -2
  218. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  219. airbyte_cdk/sources/streams/call_rate.py +64 -21
  220. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  221. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  222. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  223. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  224. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  225. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  226. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  227. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  228. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  229. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  230. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  231. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  232. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  233. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  234. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  236. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  237. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  238. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  239. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  240. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  241. airbyte_cdk/sources/streams/core.py +412 -87
  242. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  243. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  244. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  245. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  253. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  254. airbyte_cdk/sources/streams/http/http.py +369 -246
  255. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  256. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  257. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  259. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  260. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  261. airbyte_cdk/sources/types.py +154 -0
  262. airbyte_cdk/sources/utils/record_helper.py +36 -21
  263. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  264. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  265. airbyte_cdk/sources/utils/transform.py +54 -20
  266. airbyte_cdk/sql/_util/hashing.py +34 -0
  267. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  268. airbyte_cdk/sql/constants.py +32 -0
  269. airbyte_cdk/sql/exceptions.py +235 -0
  270. airbyte_cdk/sql/secrets.py +123 -0
  271. airbyte_cdk/sql/shared/__init__.py +15 -0
  272. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  273. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  274. airbyte_cdk/sql/types.py +160 -0
  275. airbyte_cdk/test/catalog_builder.py +70 -18
  276. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  277. airbyte_cdk/test/mock_http/__init__.py +1 -1
  278. airbyte_cdk/test/mock_http/matcher.py +6 -0
  279. airbyte_cdk/test/mock_http/mocker.py +57 -10
  280. airbyte_cdk/test/mock_http/request.py +19 -3
  281. airbyte_cdk/test/mock_http/response.py +3 -1
  282. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  283. airbyte_cdk/test/state_builder.py +18 -10
  284. airbyte_cdk/test/utils/__init__.py +1 -0
  285. airbyte_cdk/test/utils/data.py +24 -0
  286. airbyte_cdk/test/utils/http_mocking.py +16 -0
  287. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  288. airbyte_cdk/test/utils/reading.py +26 -0
  289. airbyte_cdk/utils/__init__.py +2 -1
  290. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  291. airbyte_cdk/utils/analytics_message.py +10 -2
  292. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  293. airbyte_cdk/utils/event_timing.py +10 -10
  294. airbyte_cdk/utils/mapping_helpers.py +3 -1
  295. airbyte_cdk/utils/message_utils.py +20 -11
  296. airbyte_cdk/utils/print_buffer.py +75 -0
  297. airbyte_cdk/utils/schema_inferrer.py +198 -28
  298. airbyte_cdk/utils/slice_hasher.py +30 -0
  299. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  300. airbyte_cdk/utils/stream_status_utils.py +8 -1
  301. airbyte_cdk/utils/traced_exception.py +61 -21
  302. airbyte_cdk-6.13.1.dev4106.dist-info/METADATA +109 -0
  303. airbyte_cdk-6.13.1.dev4106.dist-info/RECORD +349 -0
  304. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/WHEEL +1 -2
  305. airbyte_cdk-6.13.1.dev4106.dist-info/entry_points.txt +3 -0
  306. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  307. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  308. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  309. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  311. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  312. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  313. airbyte_cdk/sources/deprecated/client.py +0 -99
  314. airbyte_cdk/sources/singer/__init__.py +0 -8
  315. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  316. airbyte_cdk/sources/singer/source.py +0 -186
  317. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  318. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  319. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  320. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  321. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  322. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  323. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  324. airbyte_cdk/sources/utils/schema_models.py +0 -84
  325. airbyte_cdk-0.72.0.dist-info/METADATA +0 -243
  326. airbyte_cdk-0.72.0.dist-info/RECORD +0 -466
  327. airbyte_cdk-0.72.0.dist-info/top_level.txt +0 -3
  328. source_declarative_manifest/main.py +0 -29
  329. unit_tests/connector_builder/__init__.py +0 -3
  330. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  331. unit_tests/connector_builder/test_message_grouper.py +0 -713
  332. unit_tests/connector_builder/utils.py +0 -27
  333. unit_tests/destinations/test_destination.py +0 -243
  334. unit_tests/singer/test_singer_helpers.py +0 -56
  335. unit_tests/singer/test_singer_source.py +0 -112
  336. unit_tests/sources/__init__.py +0 -0
  337. unit_tests/sources/concurrent_source/__init__.py +0 -3
  338. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  339. unit_tests/sources/declarative/__init__.py +0 -3
  340. unit_tests/sources/declarative/auth/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  342. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  343. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  344. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  345. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  346. unit_tests/sources/declarative/checks/__init__.py +0 -3
  347. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  348. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  349. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  350. unit_tests/sources/declarative/external_component.py +0 -13
  351. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  352. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  353. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  355. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  356. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  357. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  359. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  360. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  361. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  362. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  363. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  364. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  365. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  366. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  367. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  368. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  369. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  370. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1841
  371. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  372. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  373. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  374. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  375. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  376. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  377. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  384. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  385. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  386. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  387. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  388. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  389. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  390. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  391. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  392. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  393. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  394. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  395. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  396. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  397. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  398. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  399. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  400. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  401. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  402. unit_tests/sources/declarative/schema/__init__.py +0 -6
  403. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  404. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  405. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  406. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  407. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  408. unit_tests/sources/declarative/states/__init__.py +0 -3
  409. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  411. unit_tests/sources/declarative/test_create_partial.py +0 -83
  412. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  413. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  414. unit_tests/sources/declarative/test_types.py +0 -39
  415. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  416. unit_tests/sources/file_based/__init__.py +0 -0
  417. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  419. unit_tests/sources/file_based/config/__init__.py +0 -0
  420. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  421. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  422. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  423. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  424. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  425. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  426. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  427. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  428. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  429. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  430. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  431. unit_tests/sources/file_based/helpers.py +0 -70
  432. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  433. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  434. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  435. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  436. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  437. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  438. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  439. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  440. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  441. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  442. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  443. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  444. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  445. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  446. unit_tests/sources/file_based/stream/__init__.py +0 -0
  447. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  449. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  450. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  451. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  452. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  453. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  454. unit_tests/sources/file_based/test_scenarios.py +0 -253
  455. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  456. unit_tests/sources/fixtures/__init__.py +0 -3
  457. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  458. unit_tests/sources/message/__init__.py +0 -0
  459. unit_tests/sources/message/test_repository.py +0 -153
  460. unit_tests/sources/streams/__init__.py +0 -0
  461. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  462. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  464. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  466. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  467. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  469. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  470. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  471. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  472. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  473. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  474. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  475. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  476. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  477. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  478. unit_tests/sources/streams/http/__init__.py +0 -0
  479. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  481. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  482. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  483. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  484. unit_tests/sources/streams/http/test_http.py +0 -635
  485. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  486. unit_tests/sources/streams/test_call_rate.py +0 -300
  487. unit_tests/sources/streams/test_stream_read.py +0 -405
  488. unit_tests/sources/streams/test_streams_core.py +0 -184
  489. unit_tests/sources/test_abstract_source.py +0 -1442
  490. unit_tests/sources/test_concurrent_source.py +0 -112
  491. unit_tests/sources/test_config.py +0 -92
  492. unit_tests/sources/test_connector_state_manager.py +0 -482
  493. unit_tests/sources/test_http_logger.py +0 -252
  494. unit_tests/sources/test_integration_source.py +0 -86
  495. unit_tests/sources/test_source.py +0 -684
  496. unit_tests/sources/test_source_read.py +0 -460
  497. unit_tests/test/__init__.py +0 -0
  498. unit_tests/test/mock_http/__init__.py +0 -0
  499. unit_tests/test/mock_http/test_matcher.py +0 -53
  500. unit_tests/test/mock_http/test_mocker.py +0 -214
  501. unit_tests/test/mock_http/test_request.py +0 -117
  502. unit_tests/test/mock_http/test_response_builder.py +0 -177
  503. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  504. unit_tests/utils/__init__.py +0 -0
  505. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  506. unit_tests/utils/test_mapping_helpers.py +0 -54
  507. unit_tests/utils/test_message_utils.py +0 -91
  508. unit_tests/utils/test_rate_limiting.py +0 -26
  509. unit_tests/utils/test_schema_inferrer.py +0 -202
  510. unit_tests/utils/test_secret_utils.py +0 -135
  511. unit_tests/utils/test_stream_status_utils.py +0 -61
  512. unit_tests/utils/test_traced_exception.py +0 -107
  513. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  514. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  515. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  516. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  517. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,37 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import logging
5
+ import os
6
+ from typing import Any, Dict, Iterable
7
+
8
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
9
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
10
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
+
12
+ AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
13
+ DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
14
+
15
+
16
+ class FileTransfer:
17
+ def __init__(self) -> None:
18
+ self._local_directory = (
19
+ AIRBYTE_STAGING_DIRECTORY
20
+ if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
21
+ else DEFAULT_LOCAL_DIRECTORY
22
+ )
23
+
24
+ def get_file(
25
+ self,
26
+ config: FileBasedStreamConfig,
27
+ file: RemoteFile,
28
+ stream_reader: AbstractFileBasedStreamReader,
29
+ logger: logging.Logger,
30
+ ) -> Iterable[Dict[str, Any]]:
31
+ try:
32
+ yield stream_reader.get_file(
33
+ file=file, local_directory=self._local_directory, logger=logger
34
+ )
35
+ except Exception as ex:
36
+ logger.error("An error has occurred while getting file: %s", str(ex))
37
+ raise ex
@@ -7,7 +7,10 @@ from abc import ABC, abstractmethod
7
7
  from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
8
8
 
9
9
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
10
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
11
+ AbstractFileBasedStreamReader,
12
+ FileReadMode,
13
+ )
11
14
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
12
15
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
13
16
 
@@ -6,16 +6,24 @@ import json
6
6
  import logging
7
7
  from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
8
8
 
9
+ import orjson
10
+
9
11
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
12
  from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
11
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
13
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
14
+ AbstractFileBasedStreamReader,
15
+ FileReadMode,
16
+ )
12
17
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
13
18
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
14
- from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, SchemaType, merge_schemas
19
+ from airbyte_cdk.sources.file_based.schema_helpers import (
20
+ PYTHON_TYPE_MAPPING,
21
+ SchemaType,
22
+ merge_schemas,
23
+ )
15
24
 
16
25
 
17
26
  class JsonlParser(FileTypeParser):
18
-
19
27
  MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
20
28
  ENCODING = "utf8"
21
29
 
@@ -100,18 +108,24 @@ class JsonlParser(FileTypeParser):
100
108
  read_bytes += len(line)
101
109
  accumulator += line # type: ignore [operator] # In reality, it's either bytes or string and we add the same type
102
110
  try:
103
- record = json.loads(accumulator)
111
+ record = orjson.loads(accumulator)
104
112
  if had_json_parsing_error and not has_warned_for_multiline_json_object:
105
- logger.warning(f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced")
113
+ logger.warning(
114
+ f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced"
115
+ )
106
116
  has_warned_for_multiline_json_object = True
107
117
 
108
118
  yield record
109
119
  yielded_at_least_once = True
110
120
  accumulator = self._instantiate_accumulator(line)
111
- except json.JSONDecodeError:
121
+ except orjson.JSONDecodeError:
112
122
  had_json_parsing_error = True
113
123
 
114
- if read_limit and yielded_at_least_once and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
124
+ if (
125
+ read_limit
126
+ and yielded_at_least_once
127
+ and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE
128
+ ):
115
129
  logger.warning(
116
130
  f"Exceeded the maximum number of bytes per file for schema inference ({self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE}). "
117
131
  f"Inferring schema from an incomplete set of records."
@@ -119,7 +133,9 @@ class JsonlParser(FileTypeParser):
119
133
  break
120
134
 
121
135
  if had_json_parsing_error and not yielded_at_least_once:
122
- raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
136
+ raise RecordParseError(
137
+ FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line
138
+ )
123
139
 
124
140
  @staticmethod
125
141
  def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
@@ -10,17 +10,27 @@ from urllib.parse import unquote
10
10
 
11
11
  import pyarrow as pa
12
12
  import pyarrow.parquet as pq
13
- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
14
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
15
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
13
+ from pyarrow import DictionaryArray, Scalar
14
+
15
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
16
+ FileBasedStreamConfig,
17
+ ParquetFormat,
18
+ )
19
+ from airbyte_cdk.sources.file_based.exceptions import (
20
+ ConfigValidationError,
21
+ FileBasedSourceError,
22
+ RecordParseError,
23
+ )
24
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
25
+ AbstractFileBasedStreamReader,
26
+ FileReadMode,
27
+ )
16
28
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
17
29
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
18
30
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
19
- from pyarrow import DictionaryArray, Scalar
20
31
 
21
32
 
22
33
  class ParquetParser(FileTypeParser):
23
-
24
34
  ENCODING = None
25
35
 
26
36
  def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
@@ -45,9 +55,15 @@ class ParquetParser(FileTypeParser):
45
55
  parquet_schema = parquet_file.schema_arrow
46
56
 
47
57
  # Inferred non-partition schema
48
- schema = {field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format) for field in parquet_schema}
58
+ schema = {
59
+ field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format)
60
+ for field in parquet_schema
61
+ }
49
62
  # Inferred partition schema
50
- partition_columns = {partition.split("=")[0]: {"type": "string"} for partition in self._extract_partitions(file.uri)}
63
+ partition_columns = {
64
+ partition.split("=")[0]: {"type": "string"}
65
+ for partition in self._extract_partitions(file.uri)
66
+ }
51
67
 
52
68
  schema.update(partition_columns)
53
69
  return schema
@@ -69,21 +85,27 @@ class ParquetParser(FileTypeParser):
69
85
  try:
70
86
  with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
71
87
  reader = pq.ParquetFile(fp)
72
- partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
88
+ partition_columns = {
89
+ x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)
90
+ }
73
91
  for row_group in range(reader.num_row_groups):
74
92
  batch = reader.read_row_group(row_group)
75
93
  for row in range(batch.num_rows):
76
94
  line_no += 1
77
95
  yield {
78
96
  **{
79
- column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
97
+ column: ParquetParser._to_output_value(
98
+ batch.column(column)[row], parquet_format
99
+ )
80
100
  for column in batch.column_names
81
101
  },
82
102
  **partition_columns,
83
103
  }
84
104
  except Exception as exc:
85
105
  raise RecordParseError(
86
- FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
106
+ FileBasedSourceError.ERROR_PARSING_RECORD,
107
+ filename=file.uri,
108
+ lineno=f"{row_group=}, {line_no=}",
87
109
  ) from exc
88
110
 
89
111
  @staticmethod
@@ -95,7 +117,9 @@ class ParquetParser(FileTypeParser):
95
117
  return FileReadMode.READ_BINARY
96
118
 
97
119
  @staticmethod
98
- def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any:
120
+ def _to_output_value(
121
+ parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat
122
+ ) -> Any:
99
123
  """
100
124
  Convert an entry in a pyarrow table to a value that can be output by the source.
101
125
  """
@@ -113,7 +137,11 @@ class ParquetParser(FileTypeParser):
113
137
  return None
114
138
 
115
139
  # Convert date and datetime objects to isoformat strings
116
- if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type):
140
+ if (
141
+ pa.types.is_time(parquet_value.type)
142
+ or pa.types.is_timestamp(parquet_value.type)
143
+ or pa.types.is_date(parquet_value.type)
144
+ ):
117
145
  return parquet_value.as_py().isoformat()
118
146
 
119
147
  # Convert month_day_nano_interval to array
@@ -126,7 +154,7 @@ class ParquetParser(FileTypeParser):
126
154
 
127
155
  if pa.types.is_decimal(parquet_value.type):
128
156
  if parquet_format.decimal_as_float:
129
- return parquet_value.as_py()
157
+ return float(parquet_value.as_py())
130
158
  else:
131
159
  return str(parquet_value.as_py())
132
160
 
@@ -168,7 +196,9 @@ class ParquetParser(FileTypeParser):
168
196
  }
169
197
 
170
198
  @staticmethod
171
- def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]:
199
+ def parquet_type_to_schema_type(
200
+ parquet_type: pa.DataType, parquet_format: ParquetFormat
201
+ ) -> Mapping[str, str]:
172
202
  """
173
203
  Convert a pyarrow data type to an Airbyte schema type.
174
204
  Parquet data types are defined at https://arrow.apache.org/docs/python/api/datatypes.html
@@ -198,7 +228,9 @@ class ParquetParser(FileTypeParser):
198
228
  @staticmethod
199
229
  def _is_binary(parquet_type: pa.DataType) -> bool:
200
230
  return bool(
201
- pa.types.is_binary(parquet_type) or pa.types.is_large_binary(parquet_type) or pa.types.is_fixed_size_binary(parquet_type)
231
+ pa.types.is_binary(parquet_type)
232
+ or pa.types.is_large_binary(parquet_type)
233
+ or pa.types.is_fixed_size_binary(parquet_type)
202
234
  )
203
235
 
204
236
  @staticmethod
@@ -221,13 +253,23 @@ class ParquetParser(FileTypeParser):
221
253
  pa.types.is_time(parquet_type)
222
254
  or pa.types.is_string(parquet_type)
223
255
  or pa.types.is_large_string(parquet_type)
224
- or ParquetParser._is_binary(parquet_type) # Best we can do is return as a string since we do not support binary
256
+ or ParquetParser._is_binary(
257
+ parquet_type
258
+ ) # Best we can do is return as a string since we do not support binary
225
259
  )
226
260
 
227
261
  @staticmethod
228
262
  def _is_object(parquet_type: pa.DataType) -> bool:
229
- return bool(pa.types.is_dictionary(parquet_type) or pa.types.is_struct(parquet_type) or pa.types.is_map(parquet_type))
263
+ return bool(
264
+ pa.types.is_dictionary(parquet_type)
265
+ or pa.types.is_struct(parquet_type)
266
+ or pa.types.is_map(parquet_type)
267
+ )
230
268
 
231
269
  @staticmethod
232
270
  def _is_list(parquet_type: pa.DataType) -> bool:
233
- return bool(pa.types.is_list(parquet_type) or pa.types.is_large_list(parquet_type) or parquet_type == pa.month_day_nano_interval())
271
+ return bool(
272
+ pa.types.is_list(parquet_type)
273
+ or pa.types.is_large_list(parquet_type)
274
+ or parquet_type == pa.month_day_nano_interval()
275
+ )
@@ -2,14 +2,24 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
  import logging
5
+ import os
5
6
  import traceback
6
7
  from datetime import datetime
7
8
  from io import BytesIO, IOBase
8
9
  from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
9
10
 
10
11
  import backoff
11
- import dpath.util
12
+ import dpath
13
+ import nltk
12
14
  import requests
15
+ from unstructured.file_utils.filetype import (
16
+ EXT_TO_FILETYPE,
17
+ FILETYPE_TO_MIMETYPE,
18
+ STR_TO_FILETYPE,
19
+ FileType,
20
+ detect_filetype,
21
+ )
22
+
13
23
  from airbyte_cdk.models import FailureType
14
24
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
15
25
  from airbyte_cdk.sources.file_based.config.unstructured_format import (
@@ -19,17 +29,29 @@ from airbyte_cdk.sources.file_based.config.unstructured_format import (
19
29
  UnstructuredFormat,
20
30
  )
21
31
  from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
22
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
32
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
33
+ AbstractFileBasedStreamReader,
34
+ FileReadMode,
35
+ )
23
36
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
24
37
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
25
38
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
26
39
  from airbyte_cdk.utils import is_cloud_environment
27
40
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
28
- from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType, detect_filetype
29
41
 
30
42
  unstructured_partition_pdf = None
31
43
  unstructured_partition_docx = None
32
44
  unstructured_partition_pptx = None
45
+ nltk_data_dir = "/tmp/nltk_data"
46
+
47
+ try:
48
+ os.makedirs(nltk_data_dir, exist_ok=True)
49
+ nltk.data.path.append(nltk_data_dir)
50
+ nltk.data.find("tokenizers/punkt.zip")
51
+ nltk.data.find("tokenizers/punkt_tab.zip")
52
+ except LookupError:
53
+ nltk.download("punkt", download_dir=nltk_data_dir)
54
+ nltk.download("punkt_tab", download_dir=nltk_data_dir)
33
55
 
34
56
 
35
57
  def optional_decode(contents: Union[str, bytes]) -> str:
@@ -100,16 +122,21 @@ class UnstructuredParser(FileTypeParser):
100
122
  format = _extract_format(config)
101
123
  with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
102
124
  filetype = self._get_filetype(file_handle, file)
103
-
104
125
  if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
105
- raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
126
+ raise self._create_parse_error(
127
+ file,
128
+ self._get_file_type_error_message(filetype),
129
+ )
106
130
 
107
131
  return {
108
132
  "content": {
109
133
  "type": "string",
110
134
  "description": "Content of the file as markdown. Might be null if the file could not be parsed",
111
135
  },
112
- "document_key": {"type": "string", "description": "Unique identifier of the document, e.g. the file path"},
136
+ "document_key": {
137
+ "type": "string",
138
+ "description": "Unique identifier of the document, e.g. the file path",
139
+ },
113
140
  "_ab_source_file_parse_error": {
114
141
  "type": "string",
115
142
  "description": "Error message if the file could not be parsed even though the file is supported",
@@ -148,26 +175,54 @@ class UnstructuredParser(FileTypeParser):
148
175
  logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
149
176
  else:
150
177
  raise e
178
+ except Exception as e:
179
+ exception_str = str(e)
180
+ logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
181
+ raise e
151
182
 
152
- def _read_file(self, file_handle: IOBase, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> str:
183
+ def _read_file(
184
+ self,
185
+ file_handle: IOBase,
186
+ remote_file: RemoteFile,
187
+ format: UnstructuredFormat,
188
+ logger: logging.Logger,
189
+ ) -> str:
153
190
  _import_unstructured()
154
- if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx):
191
+ if (
192
+ (not unstructured_partition_pdf)
193
+ or (not unstructured_partition_docx)
194
+ or (not unstructured_partition_pptx)
195
+ ):
155
196
  # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
156
197
  raise Exception("unstructured library is not available")
157
198
 
158
- filetype = self._get_filetype(file_handle, remote_file)
199
+ filetype: FileType | None = self._get_filetype(file_handle, remote_file)
159
200
 
160
- if filetype == FileType.MD or filetype == FileType.TXT:
201
+ if filetype is None or filetype not in self._supported_file_types():
202
+ raise self._create_parse_error(
203
+ remote_file,
204
+ self._get_file_type_error_message(filetype),
205
+ )
206
+ if filetype in {FileType.MD, FileType.TXT}:
161
207
  file_content: bytes = file_handle.read()
162
208
  decoded_content: str = optional_decode(file_content)
163
209
  return decoded_content
164
- if filetype not in self._supported_file_types():
165
- raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
166
210
  if format.processing.mode == "local":
167
- return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
211
+ return self._read_file_locally(
212
+ file_handle,
213
+ filetype,
214
+ format.strategy,
215
+ remote_file,
216
+ )
168
217
  elif format.processing.mode == "api":
169
218
  try:
170
- result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file)
219
+ result: str = self._read_file_remotely_with_retries(
220
+ file_handle,
221
+ format.processing,
222
+ filetype,
223
+ format.strategy,
224
+ remote_file,
225
+ )
171
226
  except Exception as e:
172
227
  # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
173
228
  #
@@ -175,11 +230,15 @@ class UnstructuredParser(FileTypeParser):
175
230
  # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
176
231
  if isinstance(e, RecordParseError):
177
232
  raise e
178
- raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error)
233
+ raise AirbyteTracedException.from_exception(
234
+ e, failure_type=FailureType.config_error
235
+ )
179
236
 
180
237
  return result
181
238
 
182
- def _params_to_dict(self, params: Optional[List[APIParameterConfigModel]], strategy: str) -> Dict[str, Union[str, List[str]]]:
239
+ def _params_to_dict(
240
+ self, params: Optional[List[APIParameterConfigModel]], strategy: str
241
+ ) -> Dict[str, Union[str, List[str]]]:
183
242
  result_dict: Dict[str, Union[str, List[str]]] = {"strategy": strategy}
184
243
  if params is None:
185
244
  return result_dict
@@ -229,9 +288,16 @@ class UnstructuredParser(FileTypeParser):
229
288
 
230
289
  return True, None
231
290
 
232
- @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error)
291
+ @backoff.on_exception(
292
+ backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error
293
+ )
233
294
  def _read_file_remotely_with_retries(
234
- self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
295
+ self,
296
+ file_handle: IOBase,
297
+ format: APIProcessingConfigModel,
298
+ filetype: FileType,
299
+ strategy: str,
300
+ remote_file: RemoteFile,
235
301
  ) -> str:
236
302
  """
237
303
  Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
@@ -239,7 +305,12 @@ class UnstructuredParser(FileTypeParser):
239
305
  return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
240
306
 
241
307
  def _read_file_remotely(
242
- self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
308
+ self,
309
+ file_handle: IOBase,
310
+ format: APIProcessingConfigModel,
311
+ filetype: FileType,
312
+ strategy: str,
313
+ remote_file: RemoteFile,
243
314
  ) -> str:
244
315
  headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
245
316
 
@@ -247,7 +318,9 @@ class UnstructuredParser(FileTypeParser):
247
318
 
248
319
  file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
249
320
 
250
- response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data)
321
+ response = requests.post(
322
+ f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
323
+ )
251
324
 
252
325
  if response.status_code == 422:
253
326
  # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
@@ -260,9 +333,15 @@ class UnstructuredParser(FileTypeParser):
260
333
 
261
334
  return self._render_markdown(json_response)
262
335
 
263
- def _read_file_locally(self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile) -> str:
336
+ def _read_file_locally(
337
+ self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
338
+ ) -> str:
264
339
  _import_unstructured()
265
- if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx):
340
+ if (
341
+ (not unstructured_partition_pdf)
342
+ or (not unstructured_partition_docx)
343
+ or (not unstructured_partition_pptx)
344
+ ):
266
345
  # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
267
346
  raise Exception("unstructured library is not available")
268
347
 
@@ -289,8 +368,14 @@ class UnstructuredParser(FileTypeParser):
289
368
 
290
369
  return self._render_markdown([element.to_dict() for element in elements])
291
370
 
292
- def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
293
- return RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message)
371
+ def _create_parse_error(
372
+ self,
373
+ remote_file: RemoteFile,
374
+ message: str,
375
+ ) -> RecordParseError:
376
+ return RecordParseError(
377
+ FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
378
+ )
294
379
 
295
380
  def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
296
381
  """
@@ -311,39 +396,58 @@ class UnstructuredParser(FileTypeParser):
311
396
  # detect_filetype is either using the file name or file content
312
397
  # if possible, try to leverage the file name to detect the file type
313
398
  # if the file name is not available, use the file content
314
- file_type = detect_filetype(
315
- filename=remote_file.uri,
316
- )
317
- if file_type is not None and not file_type == FileType.UNK:
399
+ file_type: FileType | None = None
400
+ try:
401
+ file_type = detect_filetype(
402
+ filename=remote_file.uri,
403
+ )
404
+ except Exception:
405
+ # Path doesn't exist locally. Try something else...
406
+ pass
407
+
408
+ if file_type and file_type != FileType.UNK:
318
409
  return file_type
319
410
 
320
411
  type_based_on_content = detect_filetype(file=file)
412
+ file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
321
413
 
322
- # detect_filetype is reading to read the file content
323
- file.seek(0)
414
+ if type_based_on_content and type_based_on_content != FileType.UNK:
415
+ return type_based_on_content
324
416
 
325
- return type_based_on_content
417
+ extension = "." + remote_file.uri.split(".")[-1].lower()
418
+ if extension in EXT_TO_FILETYPE:
419
+ return EXT_TO_FILETYPE[extension]
420
+
421
+ return None
326
422
 
327
423
  def _supported_file_types(self) -> List[Any]:
328
424
  return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
329
425
 
330
- def _get_file_type_error_message(self, file_type: FileType) -> str:
426
+ def _get_file_type_error_message(
427
+ self,
428
+ file_type: FileType | None,
429
+ ) -> str:
331
430
  supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
332
- return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
431
+ return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
333
432
 
334
433
  def _render_markdown(self, elements: List[Any]) -> str:
335
434
  return "\n\n".join((self._convert_to_markdown(el) for el in elements))
336
435
 
337
436
  def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
338
- if dpath.util.get(el, "type") == "Title":
339
- heading_str = "#" * (dpath.util.get(el, "metadata/category_depth", default=1) or 1)
340
- return f"{heading_str} {dpath.util.get(el, 'text')}"
341
- elif dpath.util.get(el, "type") == "ListItem":
342
- return f"- {dpath.util.get(el, 'text')}"
343
- elif dpath.util.get(el, "type") == "Formula":
344
- return f"```\n{dpath.util.get(el, 'text')}\n```"
437
+ if dpath.get(el, "type") == "Title":
438
+ category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
439
+ if not isinstance(category_depth, int):
440
+ category_depth = (
441
+ int(category_depth) if isinstance(category_depth, (str, float)) else 1
442
+ )
443
+ heading_str = "#" * category_depth
444
+ return f"{heading_str} {dpath.get(el, 'text')}"
445
+ elif dpath.get(el, "type") == "ListItem":
446
+ return f"- {dpath.get(el, 'text')}"
447
+ elif dpath.get(el, "type") == "Formula":
448
+ return f"```\n{dpath.get(el, 'text')}\n```"
345
449
  else:
346
- return str(dpath.util.get(el, "text", default=""))
450
+ return str(dpath.get(el, "text", default=""))
347
451
 
348
452
  @property
349
453
  def file_read_mode(self) -> FileReadMode:
@@ -5,7 +5,7 @@
5
5
  from datetime import datetime
6
6
  from typing import Optional
7
7
 
8
- from pydantic import BaseModel
8
+ from pydantic.v1 import BaseModel
9
9
 
10
10
 
11
11
  class RemoteFile(BaseModel):