classifyre-cli 0.4.36__tar.gz → 0.4.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. classifyre_cli-0.4.37/.turbo/turbo-build.log +3 -0
  2. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/package.json +1 -1
  4. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/pyproject.toml +24 -27
  5. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/models/generated_input.py +116 -341
  6. classifyre_cli-0.4.37/src/sources/delta_lake/source.py +146 -0
  7. classifyre_cli-0.4.37/src/sources/iceberg/source.py +155 -0
  8. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/kafka/source.py +161 -100
  9. classifyre_cli-0.4.37/src/sources/lakehouse_base.py +407 -0
  10. classifyre_cli-0.4.37/src/sources/s3_client.py +72 -0
  11. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/s3_compatible_storage/source.py +9 -40
  12. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/dependency_groups.py +0 -2
  13. classifyre_cli-0.4.37/tests/_lakehouse_fakes.py +41 -0
  14. classifyre_cli-0.4.37/tests/test_delta_lake_source.py +165 -0
  15. classifyre_cli-0.4.37/tests/test_iceberg_source.py +175 -0
  16. classifyre_cli-0.4.37/tests/test_kafka_source.py +251 -0
  17. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_source_dependency_groups.py +5 -3
  18. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/uv.lock +479 -228
  19. classifyre_cli-0.4.36/.turbo/turbo-build.log +0 -3
  20. classifyre_cli-0.4.36/src/sources/delta_lake/source.py +0 -139
  21. classifyre_cli-0.4.36/src/sources/hudi/__init__.py +0 -3
  22. classifyre_cli-0.4.36/src/sources/hudi/source.py +0 -98
  23. classifyre_cli-0.4.36/src/sources/iceberg/source.py +0 -148
  24. classifyre_cli-0.4.36/src/sources/spark_base.py +0 -413
  25. classifyre_cli-0.4.36/src/sources/spark_catalog/__init__.py +0 -3
  26. classifyre_cli-0.4.36/src/sources/spark_catalog/source.py +0 -93
  27. classifyre_cli-0.4.36/src/utils/spark_runtime.py +0 -56
  28. classifyre_cli-0.4.36/tests/_spark_fakes.py +0 -125
  29. classifyre_cli-0.4.36/tests/test_delta_lake_source.py +0 -96
  30. classifyre_cli-0.4.36/tests/test_hudi_source.py +0 -72
  31. classifyre_cli-0.4.36/tests/test_iceberg_source.py +0 -95
  32. classifyre_cli-0.4.36/tests/test_kafka_source.py +0 -192
  33. classifyre_cli-0.4.36/tests/test_spark_catalog_source.py +0 -71
  34. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/.gitignore +0 -0
  35. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/.python-version +0 -0
  36. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/README.md +0 -0
  37. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/main.py +0 -0
  38. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/scripts/generate_models.py +0 -0
  39. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/__init__.py +0 -0
  40. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/config.py +0 -0
  41. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/__init__.py +0 -0
  42. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/base.py +0 -0
  43. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/broken_links/__init__.py +0 -0
  44. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/broken_links/detector.py +0 -0
  45. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/config.py +0 -0
  46. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/content/__init__.py +0 -0
  47. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/__init__.py +0 -0
  48. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/detector.py +0 -0
  49. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/extractor.py +0 -0
  50. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/__init__.py +0 -0
  51. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_base.py +0 -0
  52. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_factory.py +0 -0
  53. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  54. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_gliner2.py +0 -0
  55. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_image_classification.py +0 -0
  56. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_llm.py +0 -0
  57. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_object_detection.py +0 -0
  58. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_regex.py +0 -0
  59. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_text_classification.py +0 -0
  60. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/custom/trainer.py +0 -0
  61. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/dependencies.py +0 -0
  62. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/pii/__init__.py +0 -0
  63. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/pii/detector.py +0 -0
  64. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/secrets/__init__.py +0 -0
  65. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/secrets/detector.py +0 -0
  66. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/threat/__init__.py +0 -0
  67. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/threat/code_security_detector.py +0 -0
  68. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/detectors/threat/yara_detector.py +0 -0
  69. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/main.py +0 -0
  70. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/models/generated_detectors.py +0 -0
  71. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/models/generated_single_asset_scan_results.py +0 -0
  72. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/outputs/__init__.py +0 -0
  73. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/outputs/base.py +0 -0
  74. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/outputs/console.py +0 -0
  75. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/outputs/factory.py +0 -0
  76. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/outputs/file.py +0 -0
  77. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/outputs/rest.py +0 -0
  78. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/pipeline/__init__.py +0 -0
  79. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/pipeline/content_provider.py +0 -0
  80. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/pipeline/detector_pipeline.py +0 -0
  81. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/pipeline/parsed_content_provider.py +0 -0
  82. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/pipeline/worker_pool.py +0 -0
  83. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sandbox/__init__.py +0 -0
  84. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sandbox/runner.py +0 -0
  85. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/__init__.py +0 -0
  86. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/asset_metadata.py +0 -0
  87. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/atlassian_common.py +0 -0
  88. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/azure_blob_storage/__init__.py +0 -0
  89. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/azure_blob_storage/source.py +0 -0
  90. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/base.py +0 -0
  91. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/confluence/__init__.py +0 -0
  92. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/confluence/source.py +0 -0
  93. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/databricks/__init__.py +0 -0
  94. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/databricks/source.py +0 -0
  95. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/delta_lake/__init__.py +0 -0
  96. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/dependencies.py +0 -0
  97. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/elasticsearch/__init__.py +0 -0
  98. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/elasticsearch/source.py +0 -0
  99. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/email/__init__.py +0 -0
  100. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/email/source.py +0 -0
  101. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/google_cloud_storage/__init__.py +0 -0
  102. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/google_cloud_storage/source.py +0 -0
  103. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/hive/__init__.py +0 -0
  104. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/hive/source.py +0 -0
  105. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/iceberg/__init__.py +0 -0
  106. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/jira/__init__.py +0 -0
  107. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/jira/source.py +0 -0
  108. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/kafka/__init__.py +0 -0
  109. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/meilisearch/__init__.py +0 -0
  110. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/meilisearch/source.py +0 -0
  111. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/mongodb/__init__.py +0 -0
  112. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/mongodb/source.py +0 -0
  113. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/mssql/__init__.py +0 -0
  114. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/mssql/source.py +0 -0
  115. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/mysql/__init__.py +0 -0
  116. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/mysql/source.py +0 -0
  117. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/neo4j/__init__.py +0 -0
  118. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/neo4j/source.py +0 -0
  119. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/notion/__init__.py +0 -0
  120. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/notion/client.py +0 -0
  121. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/notion/source.py +0 -0
  122. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/object_storage/base.py +0 -0
  123. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/opensearch/__init__.py +0 -0
  124. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/opensearch/source.py +0 -0
  125. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/oracle/__init__.py +0 -0
  126. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/oracle/source.py +0 -0
  127. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/postgresql/__init__.py +0 -0
  128. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/postgresql/source.py +0 -0
  129. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/powerbi/__init__.py +0 -0
  130. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/powerbi/source.py +0 -0
  131. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/recipe_normalizer.py +0 -0
  132. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/s3_compatible_storage/README.md +0 -0
  133. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/s3_compatible_storage/__init__.py +0 -0
  134. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/search_engine_base.py +0 -0
  135. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/servicedesk/__init__.py +0 -0
  136. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/servicedesk/source.py +0 -0
  137. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/slack/__init__.py +0 -0
  138. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/slack/source.py +0 -0
  139. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/snowflake/__init__.py +0 -0
  140. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/snowflake/source.py +0 -0
  141. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/sqlite/__init__.py +0 -0
  142. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/sqlite/source.py +0 -0
  143. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/tableau/__init__.py +0 -0
  144. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/tableau/source.py +0 -0
  145. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/tabular_base.py +0 -0
  146. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/tabular_utils.py +0 -0
  147. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/wordpress/__init__.py +0 -0
  148. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/wordpress/source.py +0 -0
  149. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/youtube/__init__.py +0 -0
  150. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/sources/youtube/source.py +0 -0
  151. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/telemetry.py +0 -0
  152. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/__init__.py +0 -0
  153. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/content_extraction.py +0 -0
  154. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/embedded_images.py +0 -0
  155. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/file_metadata.py +0 -0
  156. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/file_parser.py +0 -0
  157. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/file_to_images.py +0 -0
  158. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/hashing.py +0 -0
  159. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/resources.py +0 -0
  160. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/transcription.py +0 -0
  161. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/uv_sync.py +0 -0
  162. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/src/utils/validation.py +0 -0
  163. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/__init__.py +0 -0
  164. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/conftest.py +0 -0
  165. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/__init__.py +0 -0
  166. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  167. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/conftest.py +0 -0
  168. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/content/__init__.py +0 -0
  169. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/custom/__init__.py +0 -0
  170. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/custom/conftest.py +0 -0
  171. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  172. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/custom/test_llm_runner.py +0 -0
  173. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  174. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/custom/test_regex_runner.py +0 -0
  175. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/custom/test_transformer_runners.py +0 -0
  176. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/pii/__init__.py +0 -0
  177. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/pii/conftest.py +0 -0
  178. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/pii/sample_invoice.pdf +0 -0
  179. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/pii/test_pii_detector.py +0 -0
  180. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  181. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/secrets/__init__.py +0 -0
  182. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  183. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  184. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/test_base_detector.py +0 -0
  185. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  186. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  187. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/test_detector_pipeline_types.py +0 -0
  188. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/test_detector_schema_examples.py +0 -0
  189. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/test_detector_types.py +0 -0
  190. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/test_phase2_detectors.py +0 -0
  191. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/test_registry.py +0 -0
  192. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/threat/__init__.py +0 -0
  193. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/threat/test_code_security_detector.py +0 -0
  194. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/detectors/threat/test_yara_detector.py +0 -0
  195. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  196. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/integration/test_wordpress_links_assets.py +0 -0
  197. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/pipeline/test_detector_pipeline.py +0 -0
  198. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/pipeline/test_worker_pool.py +0 -0
  199. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_assets_metadata_catalog.py +0 -0
  200. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_azure_blob_storage_source.py +0 -0
  201. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_base_source_attachment.py +0 -0
  202. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_base_source_sampling.py +0 -0
  203. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_config.py +0 -0
  204. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_confluence_source.py +0 -0
  205. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_custom_extractor.py +0 -0
  206. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_databricks_source.py +0 -0
  207. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_dependency_groups.py +0 -0
  208. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_elasticsearch_source.py +0 -0
  209. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_email_source.py +0 -0
  210. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_google_cloud_storage_source.py +0 -0
  211. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_hashing.py +0 -0
  212. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_hive_source.py +0 -0
  213. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_jira_source.py +0 -0
  214. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_meilisearch_source.py +0 -0
  215. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_mongodb_source.py +0 -0
  216. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_mssql_source.py +0 -0
  217. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_mysql_source.py +0 -0
  218. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_neo4j_source.py +0 -0
  219. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_notion_source.py +0 -0
  220. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_opensearch_source.py +0 -0
  221. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_oracle_source.py +0 -0
  222. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_outputs.py +0 -0
  223. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_postgresql_source.py +0 -0
  224. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_powerbi_source.py +0 -0
  225. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_recipe_normalizer.py +0 -0
  226. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_s3_compatible_storage_source.py +0 -0
  227. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_sampling_automatic.py +0 -0
  228. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_sandbox_runner.py +0 -0
  229. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_servicedesk_source.py +0 -0
  230. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_slack_source.py +0 -0
  231. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_snowflake_source.py +0 -0
  232. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_sqlite_source.py +0 -0
  233. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_tableau_source.py +0 -0
  234. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_tabular_automatic_sampling.py +0 -0
  235. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_tabular_utils.py +0 -0
  236. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_uv_sync.py +0 -0
  237. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_wordpress_source.py +0 -0
  238. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_youtube_source.py +0 -0
  239. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/test_youtube_source_integration.py +0 -0
  240. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/utils/test_content_extraction.py +0 -0
  241. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/utils/test_embedded_images.py +0 -0
  242. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/utils/test_file_metadata.py +0 -0
  243. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/utils/test_file_parser.py +0 -0
  244. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/utils/test_file_to_images.py +0 -0
  245. {classifyre_cli-0.4.36 → classifyre_cli-0.4.37}/tests/utils/test_transcription.py +0 -0
@@ -0,0 +1,3 @@
1
+ $ uv sync
2
+ Resolved 276 packages in 336ms
3
+ Checked 51 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.36
3
+ Version: 0.4.37
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.36",
3
+ "version": "0.4.37",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.36"
3
+ version = "0.4.37"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -51,6 +51,8 @@ dev = [
51
51
  "datamodel-code-generator>=0.57.0",
52
52
  "pytest>=9.0.3",
53
53
  "pytest-asyncio>=0.24.0",
54
+ # Real SQL engine for the lakehouse source tests (sampling over Parquet).
55
+ "duckdb>=1.1.0",
54
56
  ]
55
57
  privacy = [
56
58
  "presidio-analyzer>=2.2.362",
@@ -134,6 +136,9 @@ file-processing = [
134
136
  "openpyxl>=3.1.5",
135
137
  "chardet>=7.4.3",
136
138
  "pyarrow>=18.0.0",
139
+ # Table layer: Iceberg metadata resolution + SQL over Parquet without Spark.
140
+ "pyiceberg>=0.9.0",
141
+ "duckdb>=1.1.0",
137
142
  ]
138
143
  transcription = [
139
144
  # CPU audio/video transcription. faster-whisper decodes media via bundled
@@ -202,37 +207,23 @@ youtube = [
202
207
  "yt-dlp>=2025.1.0",
203
208
  "youtube-transcript-api>=1.0.0",
204
209
  ]
205
- spark = [
206
- # Shared PySpark runtime for the lakehouse sources. Requires a JDK (Java 21
207
- # LTS) on the host; format JARs (Delta/Iceberg/Hudi) are resolved at runtime
208
- # via spark.jars.packages (see SPARK_* env vars). Pinned to the Spark 4.1
209
- # minor so the format-JAR coordinates below stay version-matched.
210
- #
211
- # The `connect` extra pulls in pandas>=2.2, pyarrow, and grpcio, which the
212
- # Spark Connect client (Spark Catalog via sc:// URLs) hard-requires at session
213
- # build time. Classic/local Spark (Delta/Hudi/Iceberg) does not need them, but
214
- # they share this group, so we ship the client deps once for all of them.
215
- "pyspark[connect]>=4.1,<4.2",
216
- ]
217
210
  delta-lake = [
218
- { include-group = "spark" },
219
- "delta-spark>=4.0",
220
- ]
221
- hudi = [
222
- # Hudi Spark integration ships as Maven JARs configured via
223
- # SPARK_JARS_PACKAGES; only the shared PySpark runtime is needed here.
224
- { include-group = "spark" },
225
- ]
226
- spark-catalog = [
227
- { include-group = "spark" },
211
+ # Pure-Python/Rust Delta reader (delta-rs) — no Spark, no JVM. Table metadata
212
+ # and data-file lists come from deltalake; rows are sampled through DuckDB
213
+ # (file-processing group) over the table's Parquet files.
214
+ { include-group = "file-processing" },
215
+ "deltalake>=0.25.0",
216
+ "boto3>=1.43.10,<2.0.0",
228
217
  ]
229
218
  iceberg = [
230
- # Inspected through Spark (iceberg-spark-runtime JARs via SPARK_JARS_PACKAGES),
231
- # sharing the JVM the other lakehouse sources already require.
232
- { include-group = "spark" },
219
+ # PyIceberg resolves table metadata and finds Parquet files without Spark;
220
+ # DuckDB (file-processing group) samples rows from those files.
221
+ { include-group = "file-processing" },
222
+ "boto3>=1.43.10,<2.0.0",
233
223
  ]
234
224
  kafka = [
235
- "kafka-python>=2.0",
225
+ # librdkafka-based client (bundled wheels, no JVM).
226
+ "confluent-kafka>=2.5.0",
236
227
  ]
237
228
  otel = [
238
229
  "opentelemetry-sdk>=1.42.0",
@@ -353,6 +344,12 @@ module = [
353
344
  "re2.*",
354
345
  "faster_whisper",
355
346
  "faster_whisper.*",
347
+ "pyiceberg.*",
348
+ "deltalake.*",
349
+ "duckdb.*",
350
+ "confluent_kafka.*",
351
+ "boto3.*",
352
+ "botocore.*",
356
353
  ]
357
354
  ignore_missing_imports = true
358
355
 
@@ -47,8 +47,6 @@ class AssetType(StrEnum):
47
47
  YOUTUBE = 'YOUTUBE'
48
48
  DELTA_LAKE = 'DELTA_LAKE'
49
49
  ICEBERG = 'ICEBERG'
50
- HUDI = 'HUDI'
51
- SPARK_CATALOG = 'SPARK_CATALOG'
52
50
  KAFKA = 'KAFKA'
53
51
  ELASTICSEARCH = 'ELASTICSEARCH'
54
52
  OPENSEARCH = 'OPENSEARCH'
@@ -353,8 +351,6 @@ class Type(StrEnum):
353
351
  YOUTUBE = 'YOUTUBE'
354
352
  DELTA_LAKE = 'DELTA_LAKE'
355
353
  ICEBERG = 'ICEBERG'
356
- HUDI = 'HUDI'
357
- SPARK_CATALOG = 'SPARK_CATALOG'
358
354
  KAFKA = 'KAFKA'
359
355
  ELASTICSEARCH = 'ELASTICSEARCH'
360
356
  OPENSEARCH = 'OPENSEARCH'
@@ -2868,8 +2864,6 @@ class Type19(StrEnum):
2868
2864
  YOUTUBE = 'YOUTUBE'
2869
2865
  DELTA_LAKE = 'DELTA_LAKE'
2870
2866
  ICEBERG = 'ICEBERG'
2871
- HUDI = 'HUDI'
2872
- SPARK_CATALOG = 'SPARK_CATALOG'
2873
2867
  KAFKA = 'KAFKA'
2874
2868
  ELASTICSEARCH = 'ELASTICSEARCH'
2875
2869
  OPENSEARCH = 'OPENSEARCH'
@@ -3117,17 +3111,6 @@ class NotionInput(CoreInput):
3117
3111
  resources: ResourceOverrides | None = None
3118
3112
 
3119
3113
 
3120
- class IcebergCatalogType(StrEnum):
3121
- """
3122
- PyIceberg catalog backend type
3123
- """
3124
-
3125
- REST = 'REST'
3126
- HIVE = 'HIVE'
3127
- GLUE = 'GLUE'
3128
- SQL = 'SQL'
3129
-
3130
-
3131
3114
  class KafkaSecurityProtocol(StrEnum):
3132
3115
  """
3133
3116
  Kafka client security protocol
@@ -3153,351 +3136,41 @@ class DeltaLakeRequired(BaseModel):
3153
3136
  model_config = ConfigDict(
3154
3137
  extra='forbid',
3155
3138
  )
3156
- warehouse_path: str = Field(
3139
+ bucket: str = Field(
3157
3140
  ...,
3158
- description='Root storage location holding Delta tables (e.g. s3a://lake/warehouse, file:///data)',
3159
- )
3160
-
3161
-
3162
- class DeltaLakeMasked(BaseModel):
3163
- """
3164
- Optional object-store credentials for the warehouse location.
3165
- """
3166
-
3167
- model_config = ConfigDict(
3168
- extra='forbid',
3169
- )
3170
- s3_access_key_id: str | None = Field(
3171
- None, description='S3 access key id for object-store warehouses'
3172
- )
3173
- s3_secret_access_key: str | None = Field(None, description='S3 secret access key')
3174
- s3_session_token: str | None = Field(None, description='Optional S3 session token')
3175
-
3176
-
3177
- class DeltaLakeOptionalConnection(BaseModel):
3178
- """
3179
- Delta Lake connection and storage options.
3180
- """
3181
-
3182
- model_config = ConfigDict(
3183
- extra='forbid',
3184
- )
3185
- metastore_uri: str | None = Field(
3186
- None,
3187
- description='Hive Metastore thrift URI; enables catalog-based table discovery',
3188
- )
3189
- endpoint_url: str | None = Field(
3190
- None, description='Custom S3-compatible endpoint URL'
3141
+ description='Bucket holding the tables AWS S3, MinIO, Cloudflare R2, Backblaze B2, Garage, and other S3-compatible endpoints',
3191
3142
  )
3192
- region: str | None = Field(None, description='Object-store region')
3193
3143
 
3194
3144
 
3195
3145
  class DeltaLakeOptionalScope(BaseModel):
3196
3146
  """
3197
- Delta Lake database and table selection scope.
3198
- """
3199
-
3200
- model_config = ConfigDict(
3201
- extra='forbid',
3202
- )
3203
- database: str | None = Field(
3204
- None, description='Single database/namespace to scan (catalog mode)'
3205
- )
3206
- include_all_databases: bool | None = Field(
3207
- False, description='Scan all visible databases except excluded system databases'
3208
- )
3209
- exclude_databases: list[str] | None = Field(
3210
- ['information_schema', 'sys'], description='Database denylist (exact names)'
3211
- )
3212
- include_tables: list[str] | None = Field(
3213
- None,
3214
- description='Optional table allowlist. Accepted forms: table or database.table',
3215
- )
3216
- table_limit: int | None = Field(
3217
- None, description='Optional cap on number of table assets per database', ge=1
3218
- )
3219
- table_paths: list[str] | None = Field(
3220
- None,
3221
- description='Explicit Delta table locations to scan when no metastore is configured',
3222
- )
3223
-
3224
-
3225
- class DeltaLakeOptional(BaseModel):
3226
- model_config = ConfigDict(
3227
- extra='forbid',
3228
- )
3229
- connection: DeltaLakeOptionalConnection | None = None
3230
- scope: DeltaLakeOptionalScope | None = None
3231
-
3232
-
3233
- class DeltaLakeInput(CoreInput):
3234
- type: Literal['DELTA_LAKE'] | None = Field(
3235
- None, description='Type of the asset or source'
3236
- )
3237
- required: DeltaLakeRequired
3238
- masked: DeltaLakeMasked | None = None
3239
- optional: DeltaLakeOptional | None = None
3240
- detectors: list[Detector] | None = Field(
3241
- None, description='Detectors to run on ingested content'
3242
- )
3243
- custom_detectors: list[CustomDetectorSelection] | None = Field(
3244
- None,
3245
- description='Reusable custom detector IDs selected from the custom detector catalog.',
3246
- )
3247
- sampling: SamplingConfig
3248
- resources: ResourceOverrides | None = None
3249
-
3250
-
3251
- class HudiRequired(BaseModel):
3252
- model_config = ConfigDict(
3253
- extra='forbid',
3254
- )
3255
- warehouse_path: str = Field(
3256
- ..., description='Root storage location holding Hudi tables'
3257
- )
3258
-
3259
-
3260
- class HudiMasked(BaseModel):
3261
- """
3262
- Optional object-store credentials for the warehouse location.
3263
- """
3264
-
3265
- model_config = ConfigDict(
3266
- extra='forbid',
3267
- )
3268
- s3_access_key_id: str | None = Field(
3269
- None, description='S3 access key id for object-store warehouses'
3270
- )
3271
- s3_secret_access_key: str | None = Field(None, description='S3 secret access key')
3272
- s3_session_token: str | None = Field(None, description='Optional S3 session token')
3273
-
3274
-
3275
- class HudiOptionalConnection(BaseModel):
3276
- """
3277
- Hudi connection and storage options.
3147
+ Delta Lake table selection scope within the bucket.
3278
3148
  """
3279
3149
 
3280
3150
  model_config = ConfigDict(
3281
3151
  extra='forbid',
3282
3152
  )
3283
- metastore_uri: str | None = Field(
3284
- None,
3285
- description='Hive Metastore thrift URI; enables catalog-based table discovery',
3286
- )
3287
- endpoint_url: str | None = Field(
3288
- None, description='Custom S3-compatible endpoint URL'
3289
- )
3290
- region: str | None = Field(None, description='Object-store region')
3291
-
3292
-
3293
- class HudiOptionalScope(BaseModel):
3294
- """
3295
- Hudi database and table selection scope.
3296
- """
3297
-
3298
- model_config = ConfigDict(
3299
- extra='forbid',
3300
- )
3301
- database: str | None = Field(
3302
- None, description='Single database/namespace to scan (catalog mode)'
3303
- )
3304
- include_all_databases: bool | None = Field(
3305
- False, description='Scan all visible databases except excluded system databases'
3306
- )
3307
- exclude_databases: list[str] | None = Field(
3308
- ['information_schema', 'sys'], description='Database denylist (exact names)'
3309
- )
3310
- include_tables: list[str] | None = Field(
3153
+ prefix: str | None = Field(
3311
3154
  None,
3312
- description='Optional table allowlist. Accepted forms: table or database.table',
3313
- )
3314
- table_limit: int | None = Field(
3315
- None, description='Optional cap on number of table assets per database', ge=1
3155
+ description='Key prefix to search for Delta Lake tables (e.g. warehouse/). Tables are auto-discovered by their _delta_log/ directory.',
3316
3156
  )
3317
3157
  table_paths: list[str] | None = Field(
3318
3158
  None,
3319
- description='Explicit Hudi table locations to scan when no metastore is configured',
3320
- )
3321
-
3322
-
3323
- class HudiOptional(BaseModel):
3324
- model_config = ConfigDict(
3325
- extra='forbid',
3326
- )
3327
- connection: HudiOptionalConnection | None = None
3328
- scope: HudiOptionalScope | None = None
3329
-
3330
-
3331
- class HudiInput(CoreInput):
3332
- type: Literal['HUDI'] | None = Field(
3333
- None, description='Type of the asset or source'
3334
- )
3335
- required: HudiRequired
3336
- masked: HudiMasked | None = None
3337
- optional: HudiOptional | None = None
3338
- detectors: list[Detector] | None = Field(
3339
- None, description='Detectors to run on ingested content'
3340
- )
3341
- custom_detectors: list[CustomDetectorSelection] | None = Field(
3342
- None,
3343
- description='Reusable custom detector IDs selected from the custom detector catalog.',
3344
- )
3345
- sampling: SamplingConfig
3346
- resources: ResourceOverrides | None = None
3347
-
3348
-
3349
- class SparkCatalogRequired(BaseModel):
3350
- model_config = ConfigDict(
3351
- extra='forbid',
3352
- )
3353
- connect_url: str = Field(
3354
- ...,
3355
- description='Spark Connect endpoint (sc://host:15002) or classic master (spark://host:7077)',
3356
- )
3357
-
3358
-
3359
- class SparkCatalogMasked(BaseModel):
3360
- """
3361
- Optional Spark Connect authentication.
3362
- """
3363
-
3364
- model_config = ConfigDict(
3365
- extra='forbid',
3366
- )
3367
- token: str | None = Field(
3368
- None, description='Bearer token for Spark Connect authentication'
3369
- )
3370
-
3371
-
3372
- class SparkCatalogOptionalScope(BaseModel):
3373
- """
3374
- Spark catalog and table selection scope.
3375
- """
3376
-
3377
- model_config = ConfigDict(
3378
- extra='forbid',
3379
- )
3380
- catalog: str | None = Field(
3381
- None, description='Spark catalog name to scan (defaults to the session catalog)'
3382
- )
3383
- database: str | None = Field(None, description='Single database/namespace to scan')
3384
- include_all_databases: bool | None = Field(
3385
- False, description='Scan all visible databases except excluded system databases'
3386
- )
3387
- exclude_databases: list[str] | None = Field(
3388
- ['information_schema', 'sys'], description='Database denylist (exact names)'
3389
- )
3390
- include_tables: list[str] | None = Field(
3391
- None,
3392
- description='Optional table allowlist. Accepted forms: table or database.table',
3159
+ description='Explicit Delta Lake table root keys or s3:// URIs. When set, auto-discovery under prefix is skipped.',
3393
3160
  )
3394
3161
  table_limit: int | None = Field(
3395
- None, description='Optional cap on number of table assets per database', ge=1
3162
+ None, description='Optional cap on number of table assets', ge=1
3396
3163
  )
3397
3164
 
3398
3165
 
3399
- class SparkCatalogOptional(BaseModel):
3400
- model_config = ConfigDict(
3401
- extra='forbid',
3402
- )
3403
- scope: SparkCatalogOptionalScope | None = None
3404
-
3405
-
3406
- class SparkCatalogInput(CoreInput):
3407
- type: Literal['SPARK_CATALOG'] | None = Field(
3408
- None, description='Type of the asset or source'
3409
- )
3410
- required: SparkCatalogRequired
3411
- masked: SparkCatalogMasked | None = None
3412
- optional: SparkCatalogOptional | None = None
3413
- detectors: list[Detector] | None = Field(
3414
- None, description='Detectors to run on ingested content'
3415
- )
3416
- custom_detectors: list[CustomDetectorSelection] | None = Field(
3417
- None,
3418
- description='Reusable custom detector IDs selected from the custom detector catalog.',
3419
- )
3420
- sampling: SamplingConfig
3421
- resources: ResourceOverrides | None = None
3422
-
3423
-
3424
3166
  class IcebergRequired(BaseModel):
3425
3167
  model_config = ConfigDict(
3426
3168
  extra='forbid',
3427
3169
  )
3428
- catalog_type: IcebergCatalogType
3429
- catalog_uri: str | None = Field(
3430
- None,
3431
- description='Catalog URI (REST endpoint, Hive metastore thrift URI, or SQL DSN). Not required for GLUE.',
3432
- )
3433
- warehouse: str = Field(
3434
- ..., description='Warehouse location root (e.g. s3://bucket/warehouse)'
3435
- )
3436
-
3437
-
3438
- class IcebergMasked(BaseModel):
3439
- """
3440
- Optional Iceberg catalog/storage credentials.
3441
- """
3442
-
3443
- model_config = ConfigDict(
3444
- extra='forbid',
3445
- )
3446
- token: str | None = Field(None, description='Bearer token for a REST catalog')
3447
- aws_access_key_id: str | None = Field(
3448
- None, description='AWS access key id (Glue/S3)'
3449
- )
3450
- aws_secret_access_key: str | None = Field(
3451
- None, description='AWS secret access key (Glue/S3)'
3452
- )
3453
-
3454
-
3455
- class IcebergOptionalScope(BaseModel):
3456
- """
3457
- Iceberg namespace and table selection scope.
3458
- """
3459
-
3460
- model_config = ConfigDict(
3461
- extra='forbid',
3462
- )
3463
- namespace: str | None = Field(
3464
- None, description='Single namespace to scan (dotted form supported)'
3465
- )
3466
- include_all_namespaces: bool | None = Field(
3467
- False, description='Scan all visible namespaces'
3468
- )
3469
- include_tables: list[str] | None = Field(
3470
- None,
3471
- description='Optional table allowlist. Accepted forms: table or namespace.table',
3472
- )
3473
- table_limit: int | None = Field(
3474
- None, description='Optional cap on number of table assets per namespace', ge=1
3475
- )
3476
-
3477
-
3478
- class IcebergOptional(BaseModel):
3479
- model_config = ConfigDict(
3480
- extra='forbid',
3481
- )
3482
- scope: IcebergOptionalScope | None = None
3483
-
3484
-
3485
- class IcebergInput(CoreInput):
3486
- type: Literal['ICEBERG'] | None = Field(
3487
- None, description='Type of the asset or source'
3488
- )
3489
- required: IcebergRequired
3490
- masked: IcebergMasked | None = None
3491
- optional: IcebergOptional | None = None
3492
- detectors: list[Detector] | None = Field(
3493
- None, description='Detectors to run on ingested content'
3494
- )
3495
- custom_detectors: list[CustomDetectorSelection] | None = Field(
3496
- None,
3497
- description='Reusable custom detector IDs selected from the custom detector catalog.',
3170
+ bucket: str = Field(
3171
+ ...,
3172
+ description='Bucket holding the tables — AWS S3, MinIO, Cloudflare R2, Backblaze B2, Garage, and other S3-compatible endpoints',
3498
3173
  )
3499
- sampling: SamplingConfig
3500
- resources: ResourceOverrides | None = None
3501
3174
 
3502
3175
 
3503
3176
  class NoAuthentication(BaseModel):
@@ -3881,6 +3554,60 @@ class MeilisearchInput(CoreInput):
3881
3554
  resources: ResourceOverrides | None = None
3882
3555
 
3883
3556
 
3557
+ class LakehouseStorageConnection(BaseModel):
3558
+ """
3559
+ S3-compatible storage connection options (AWS S3, MinIO, Cloudflare R2, Backblaze B2, Garage, ...). Mirrors the S3 Compatible Storage source connection settings.
3560
+ """
3561
+
3562
+ model_config = ConfigDict(
3563
+ extra='forbid',
3564
+ )
3565
+ region_name: str | None = Field(
3566
+ None,
3567
+ description='Region (recommended for AWS; required by some S3-compatible providers)',
3568
+ )
3569
+ endpoint_url: AnyUrl | None = Field(
3570
+ None,
3571
+ description='Custom endpoint URL for MinIO/R2/B2/Garage and other S3-compatible providers',
3572
+ )
3573
+ request_timeout_seconds: float | None = Field(
3574
+ 30,
3575
+ description='Network timeout in seconds for storage list/read operations',
3576
+ ge=1.0,
3577
+ le=300.0,
3578
+ )
3579
+ max_keys_per_page: int | None = Field(
3580
+ 1000,
3581
+ description='Maximum objects requested per provider list API call during table discovery',
3582
+ ge=1,
3583
+ le=1000,
3584
+ )
3585
+ verify_ssl: bool | None = Field(
3586
+ True, description='TLS certificate verification toggle'
3587
+ )
3588
+
3589
+
3590
+ class IcebergOptionalScope(BaseModel):
3591
+ """
3592
+ Apache Iceberg table selection scope within the bucket.
3593
+ """
3594
+
3595
+ model_config = ConfigDict(
3596
+ extra='forbid',
3597
+ )
3598
+ prefix: str | None = Field(
3599
+ None,
3600
+ description='Key prefix to search for Apache Iceberg tables (e.g. warehouse/). Tables are auto-discovered by their metadata/ directory.',
3601
+ )
3602
+ table_paths: list[str] | None = Field(
3603
+ None,
3604
+ description='Explicit Apache Iceberg table root keys or s3:// URIs. When set, auto-discovery under prefix is skipped.',
3605
+ )
3606
+ table_limit: int | None = Field(
3607
+ None, description='Optional cap on number of table assets', ge=1
3608
+ )
3609
+
3610
+
3884
3611
  class YouTubeInput(CoreInput):
3885
3612
  type: Literal['YOUTUBE'] | None = Field(
3886
3613
  None, description='Type of the asset or source'
@@ -3899,6 +3626,58 @@ class YouTubeInput(CoreInput):
3899
3626
  resources: ResourceOverrides | None = None
3900
3627
 
3901
3628
 
3629
+ class DeltaLakeOptional(BaseModel):
3630
+ model_config = ConfigDict(
3631
+ extra='forbid',
3632
+ )
3633
+ connection: LakehouseStorageConnection | None = None
3634
+ scope: DeltaLakeOptionalScope | None = None
3635
+
3636
+
3637
+ class DeltaLakeInput(CoreInput):
3638
+ type: Literal['DELTA_LAKE'] | None = Field(
3639
+ None, description='Type of the asset or source'
3640
+ )
3641
+ required: DeltaLakeRequired
3642
+ masked: S3CompatibleStorageMasked | None = None
3643
+ optional: DeltaLakeOptional | None = None
3644
+ detectors: list[Detector] | None = Field(
3645
+ None, description='Detectors to run on ingested content'
3646
+ )
3647
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
3648
+ None,
3649
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
3650
+ )
3651
+ sampling: SamplingConfig
3652
+ resources: ResourceOverrides | None = None
3653
+
3654
+
3655
+ class IcebergOptional(BaseModel):
3656
+ model_config = ConfigDict(
3657
+ extra='forbid',
3658
+ )
3659
+ connection: LakehouseStorageConnection | None = None
3660
+ scope: IcebergOptionalScope | None = None
3661
+
3662
+
3663
+ class IcebergInput(CoreInput):
3664
+ type: Literal['ICEBERG'] | None = Field(
3665
+ None, description='Type of the asset or source'
3666
+ )
3667
+ required: IcebergRequired
3668
+ masked: S3CompatibleStorageMasked | None = None
3669
+ optional: IcebergOptional | None = None
3670
+ detectors: list[Detector] | None = Field(
3671
+ None, description='Detectors to run on ingested content'
3672
+ )
3673
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
3674
+ None,
3675
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
3676
+ )
3677
+ sampling: SamplingConfig
3678
+ resources: ResourceOverrides | None = None
3679
+
3680
+
3902
3681
  class SourceInput(
3903
3682
  RootModel[
3904
3683
  SlackInput
@@ -3926,8 +3705,6 @@ class SourceInput(
3926
3705
  | YouTubeInput
3927
3706
  | DeltaLakeInput
3928
3707
  | IcebergInput
3929
- | HudiInput
3930
- | SparkCatalogInput
3931
3708
  | KafkaInput
3932
3709
  | ElasticsearchInput
3933
3710
  | OpenSearchInput
@@ -3960,8 +3737,6 @@ class SourceInput(
3960
3737
  | YouTubeInput
3961
3738
  | DeltaLakeInput
3962
3739
  | IcebergInput
3963
- | HudiInput
3964
- | SparkCatalogInput
3965
3740
  | KafkaInput
3966
3741
  | ElasticsearchInput
3967
3742
  | OpenSearchInput