airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,196 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from io import IOBase
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
9
+
10
+ import orjson
11
+ import pandas as pd
12
+ from numpy import datetime64, issubdtype
13
+ from numpy import dtype as dtype_
14
+ from pydantic.v1 import BaseModel
15
+
16
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
17
+ ExcelFormat,
18
+ FileBasedStreamConfig,
19
+ )
20
+ from airbyte_cdk.sources.file_based.exceptions import (
21
+ ConfigValidationError,
22
+ FileBasedSourceError,
23
+ RecordParseError,
24
+ )
25
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
26
+ AbstractFileBasedStreamReader,
27
+ FileReadMode,
28
+ )
29
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
30
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
31
+ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
32
+
33
+
34
class ExcelParser(FileTypeParser):
    """
    File-type parser that loads an Excel workbook into a pandas DataFrame.

    Files are opened in binary mode and parsed with the ``calamine`` engine.
    ``ExcelFile.parse()`` is called with its default arguments, so only the
    sheet pandas selects by default is read.
    """

    # Passed to the stream reader as the encoding; files are read as binary.
    ENCODING = None

    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        ExcelParser does not require config checks, implicit pydantic validation is enough.

        Returns:
            Tuple[bool, Optional[str]]: Always ``(True, None)``.
        """
        return True, None

    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Infers the schema of the Excel file by examining its contents.

        Args:
            config (FileBasedStreamConfig): Configuration for the file-based stream.
            file (RemoteFile): The remote file to be read.
            stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
            logger (logging.Logger): Logger for logging information and errors.

        Returns:
            SchemaType: Mapping of column name to a JSON-schema fragment. Datetime
                columns become ``{"type": "string", "format": "date-time"}``; every
                other column becomes ``{"type": <json type>}``.

        Raises:
            ConfigValidationError: If ``config.format`` is not an ExcelFormat.
        """

        # Validate the format of the config
        self.validate_format(config.format, logger)

        fields: Dict[str, str] = {}

        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
            df = self.open_and_parse_file(fp)
            for column, df_type in df.dtypes.items():
                # Choose the broadest data type if the column's data type differs in dataframes
                prev_frame_column_type = fields.get(column)  # type: ignore [call-overload]
                fields[column] = self.dtype_to_json_type(  # type: ignore [index]
                    prev_frame_column_type,
                    df_type,
                )

        schema = {
            field: (
                {"type": "string", "format": "date-time"}
                if fields[field] == "date-time"
                else {"type": fields[field]}
            )
            for field in fields
        }
        return schema

    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]] = None,
    ) -> Iterable[Dict[str, Any]]:
        """
        Parses records from an Excel file based on the provided configuration.

        Args:
            config (FileBasedStreamConfig): Configuration for the file-based stream.
            file (RemoteFile): The remote file to be read.
            stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
            logger (logging.Logger): Logger for logging information and errors.
            discovered_schema (Optional[Mapping[str, SchemaType]]): Discovered schema for validation.
                Not used by this parser.

        Yields:
            Iterable[Dict[str, Any]]: Parsed records from the Excel file.

        Raises:
            ConfigValidationError: If ``config.format`` is not an ExcelFormat.
            RecordParseError: If opening, parsing or serializing the file fails.
        """

        # Validate the format of the config
        self.validate_format(config.format, logger)

        try:
            # Open and parse the file using the stream reader
            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                df = self.open_and_parse_file(fp)
                # Yield records as dictionaries
                # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
                # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
                # see PR description: https://github.com/airbytehq/airbyte/pull/44444/
                yield from orjson.loads(
                    df.to_json(orient="records", date_format="iso", date_unit="us")
                )

        except Exception as exc:
            # Raise a RecordParseError if any exception occurs during parsing
            raise RecordParseError(
                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri
            ) from exc

    @property
    def file_read_mode(self) -> FileReadMode:
        """
        Returns the file read mode for the Excel file.

        Returns:
            FileReadMode: The file read mode (binary).
        """
        return FileReadMode.READ_BINARY

    @staticmethod
    def dtype_to_json_type(
        current_type: Optional[str],
        dtype: dtype_,  # type: ignore [type-arg]
    ) -> str:
        """
        Convert Pandas DataFrame types to Airbyte Types.

        The result is widened to ``"string"`` whenever the new dtype disagrees with
        the type already recorded for the column.

        Args:
            current_type (Optional[str]): One of the previous types based on earlier dataframes.
            dtype: Pandas DataFrame type.

        Returns:
            str: Corresponding Airbyte Type, one of ``"string"``, ``"number"``,
                ``"boolean"`` or ``"date-time"``.
        """
        number_types = ("int64", "float64")
        if current_type == "string":
            # Previous column values were of the string type, no need to look further.
            return current_type
        # Compare by equality: a numpy dtype instance is never identical to the
        # builtin ``object`` type, so an ``is`` check could not match.
        if dtype == "object":
            return "string"
        if dtype in number_types and (not current_type or current_type == "number"):
            return "number"
        if dtype == "bool" and (not current_type or current_type == "boolean"):
            return "boolean"
        # Same widening rule as the branches above: keep "date-time" only when it
        # does not conflict with a previously recorded type.
        if issubdtype(dtype, datetime64) and (not current_type or current_type == "date-time"):
            return "date-time"
        return "string"

    @staticmethod
    def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
        """
        Validates if the given format is of type ExcelFormat.

        Args:
            excel_format (BaseModel): The format to be validated.
            logger (logging.Logger): Logger used to report an unexpected format.

        Raises:
            ConfigValidationError: If the format is not ExcelFormat.
        """
        if not isinstance(excel_format, ExcelFormat):
            logger.info(f"Expected ExcelFormat, got {excel_format}")
            raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)

    @staticmethod
    def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
        """
        Opens and parses the Excel file.

        Args:
            fp: File pointer to the Excel file.

        Returns:
            pd.DataFrame: Parsed data from the Excel file.
        """
        workbook = pd.ExcelFile(fp, engine="calamine")  # type: ignore [arg-type, call-overload]
        try:
            return workbook.parse()  # type: ignore [no-any-return]
        finally:
            # Release the workbook deterministically instead of waiting for
            # garbage collection.
            workbook.close()
@@ -0,0 +1,30 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import logging
5
+ from typing import Iterable, Tuple
6
+
7
+ from airbyte_cdk.models import AirbyteRecordMessageFileReference
8
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
9
+ from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
10
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
+ from airbyte_cdk.sources.utils.files_directory import get_files_directory
12
+
13
+
14
class FileTransfer:
    """Delegates the transfer of a remote file to a stream reader, targeting a fixed local directory."""

    def __init__(self) -> None:
        # Resolved once; every upload() call reuses the same target directory.
        self._local_directory = get_files_directory()

    def upload(
        self,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> Iterable[Tuple[FileRecordData, AirbyteRecordMessageFileReference]]:
        """
        Transfer ``file`` through ``stream_reader`` and yield the single result.

        Args:
            file: The remote file to transfer.
            stream_reader: Reader whose ``upload`` method performs the transfer.
            logger: Passed to the reader; also receives an error message on failure.

        Yields:
            Whatever ``stream_reader.upload`` returns (per the annotation, a
            ``(FileRecordData, AirbyteRecordMessageFileReference)`` tuple), exactly once.

        Raises:
            Exception: Any exception raised by ``stream_reader.upload`` is logged and re-raised unchanged.
        """
        # Only the upload call is guarded: an exception thrown into the generator at the
        # yield point is not an upload failure and must not be logged as one.
        try:
            uploaded = stream_reader.upload(
                file=file, local_directory=self._local_directory, logger=logger
            )
        except Exception as ex:
            logger.error("An error has occurred while getting file: %s", str(ex))
            # Bare raise keeps the original exception and traceback intact.
            raise
        yield uploaded
@@ -0,0 +1,86 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
8
+
9
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
11
+ AbstractFileBasedStreamReader,
12
+ FileReadMode,
13
+ )
14
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
15
+ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
16
+
17
Record = Dict[str, Any]


class FileTypeParser(ABC):
    """
    Base class for file-type parsers. Each supported file type provides a
    subclass implementing the abstract members below.
    """

    @property
    def parser_max_n_files_for_schema_inference(self) -> Optional[int]:
        """
        Parser-specific cap on the number of files loaded for schema inference.

        The discovery policy decides how many files are loaded; when this property is
        defined, the smaller of the two values is used. The default is no override.
        """
        return None

    @property
    def parser_max_n_files_for_parsability(self) -> Optional[int]:
        """
        Parser-specific cap on the number of files loaded to check that parsing works.

        The availability policy decides how many files are loaded; when this property is
        defined, the smaller of the two values is used. The default is no override.
        """
        return None

    def get_parser_defined_primary_key(self, config: FileBasedStreamConfig) -> Optional[str]:
        """
        Primary key defined by the parser, used when the user did not provide one.
        The default is no parser-defined key.
        """
        return None

    @abstractmethod
    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        Tell whether the config is valid for this file type.

        Returns (True, None) for a valid config, otherwise False together with an
        error message explaining why the config is invalid.
        """
        return (True, None)

    @abstractmethod
    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Infer the JSON Schema of the given file.
        """

    @abstractmethod
    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]],
    ) -> Iterable[Record]:
        """
        Parse the file and emit its records one by one.
        """

    @property
    @abstractmethod
    def file_read_mode(self) -> FileReadMode:
        """
        Mode in which the file has to be opened for reading.
        """
@@ -0,0 +1,145 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
8
+
9
+ import orjson
10
+
11
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
12
+ from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
13
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
14
+ AbstractFileBasedStreamReader,
15
+ FileReadMode,
16
+ )
17
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
18
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
19
+ from airbyte_cdk.sources.file_based.schema_helpers import (
20
+ PYTHON_TYPE_MAPPING,
21
+ SchemaType,
22
+ merge_schemas,
23
+ )
24
+
25
+
26
class JsonlParser(FileTypeParser):
    """
    Parser for JSONL files. Records are expected one per line, but a JSON object
    spread over several lines is also accepted (see ``parse_records``).
    """

    # Once this much data has been read and at least one record was produced,
    # schema inference stops reading the file.
    MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
    ENCODING = "utf8"

    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        JsonlParser does not require config checks, implicit pydantic validation is enough.
        """
        return True, None

    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Infers the schema for the file by inferring the schema for each record, and merging
        it with the previously-inferred schema. Reading is bounded by
        MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE, so the result may be based on a prefix of the file.
        """
        inferred_schema: Mapping[str, Any] = {}

        for entry in self._parse_jsonl_entries(file, stream_reader, logger, read_limit=True):
            line_schema = self._infer_schema_for_record(entry)
            inferred_schema = merge_schemas(inferred_schema, line_schema)

        return inferred_schema

    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]],
    ) -> Iterable[Dict[str, Any]]:
        """
        This code supports parsing json objects over multiple lines even though this does not align with the JSONL format. This is for
        backward compatibility reasons i.e. the previous source-s3 parser did support this. The drawback is:
        * performance as the way we support json over multiple lines is very brute forced
        * given that we don't have `newlines_in_values` config to scope the possible inputs, we might parse the whole file before knowing if
          the input is improperly formatted or if the json is over multiple lines

        The goal is to run the V4 of source-s3 in production, track the warning log emitted when there are multiline json objects and
        deprecate this feature if it's not a valid use case.
        """
        yield from self._parse_jsonl_entries(file, stream_reader, logger)

    @classmethod
    def _infer_schema_for_record(cls, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Build a flat ``{field: {"type": <name>}}`` schema for one record.

        ``None`` values map to "null"; every other value is looked up in
        PYTHON_TYPE_MAPPING by its exact Python type.
        """
        return {
            key: {"type": "null" if value is None else PYTHON_TYPE_MAPPING[type(value)]}
            for key, value in record.items()
        }

    @property
    def file_read_mode(self) -> FileReadMode:
        """The mode passed to the stream reader when opening the file."""
        return FileReadMode.READ

    def _parse_jsonl_entries(
        self,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        read_limit: bool = False,
    ) -> Iterable[Dict[str, Any]]:
        """
        Read the file line by line and yield every JSON value that can be decoded.

        Lines are accumulated until the accumulated text decodes as JSON, which is
        how objects spanning several lines are supported.

        Args:
            file: The file to read.
            stream_reader: Used to open the file.
            logger: Receives the multiline-JSON and read-limit warnings.
            read_limit: When True, stop once MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE has been
                read and at least one record has been yielded.

        Raises:
            RecordParseError: If decoding failed at least once and no record could be yielded at all.
        """
        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
            # Measured with len(line): bytes for binary lines, characters for text lines.
            read_bytes = 0
            # Number of the last line read (1-based); stays 0 for an empty file.
            lineno = 0

            had_json_parsing_error = False
            has_warned_for_multiline_json_object = False
            yielded_at_least_once = False

            accumulator = None
            for lineno, line in enumerate(fp, start=1):
                if not accumulator:
                    accumulator = self._instantiate_accumulator(line)
                read_bytes += len(line)
                accumulator += line  # type: ignore [operator]  # In reality, it's either bytes or string and we add the same type
                try:
                    record = orjson.loads(accumulator)
                except orjson.JSONDecodeError:
                    # Either malformed input or an object continuing on the next line:
                    # keep accumulating and try again with one more line.
                    had_json_parsing_error = True
                else:
                    if had_json_parsing_error and not has_warned_for_multiline_json_object:
                        logger.warning(
                            f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced"
                        )
                        has_warned_for_multiline_json_object = True

                    yield record
                    yielded_at_least_once = True
                    accumulator = self._instantiate_accumulator(line)

                if (
                    read_limit
                    and yielded_at_least_once
                    and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE
                ):
                    logger.warning(
                        f"Exceeded the maximum number of bytes per file for schema inference ({self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE}). "
                        f"Inferring schema from an incomplete set of records."
                    )
                    break

            if had_json_parsing_error and not yielded_at_least_once:
                # Report the line number, not the raw line text: `lineno` is meant to be a
                # position, and the text could leak record content into the error message.
                raise RecordParseError(
                    FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=lineno
                )

    @staticmethod
    def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
        """
        Return an empty accumulator of the same kind (bytes or str) as ``line``.

        Any other input type falls through and yields None.
        """
        if isinstance(line, bytes):
            return bytes("", json.detect_encoding(line))
        elif isinstance(line, str):
            return ""