classifyre-cli 0.4.32__tar.gz → 0.4.34__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (227)
  1. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/.gitignore +3 -0
  2. classifyre_cli-0.4.34/.turbo/turbo-build.log +3 -0
  3. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/PKG-INFO +1 -1
  4. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/package.json +1 -1
  5. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/pyproject.toml +28 -1
  6. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/models/generated_input.py +493 -0
  7. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/outputs/rest.py +1 -3
  8. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/base.py +1 -3
  9. classifyre_cli-0.4.34/src/sources/delta_lake/__init__.py +3 -0
  10. classifyre_cli-0.4.34/src/sources/delta_lake/source.py +139 -0
  11. classifyre_cli-0.4.34/src/sources/hudi/__init__.py +3 -0
  12. classifyre_cli-0.4.34/src/sources/hudi/source.py +98 -0
  13. classifyre_cli-0.4.34/src/sources/iceberg/__init__.py +3 -0
  14. classifyre_cli-0.4.34/src/sources/iceberg/source.py +148 -0
  15. classifyre_cli-0.4.34/src/sources/kafka/__init__.py +3 -0
  16. classifyre_cli-0.4.34/src/sources/kafka/source.py +343 -0
  17. classifyre_cli-0.4.34/src/sources/spark_base.py +413 -0
  18. classifyre_cli-0.4.34/src/sources/spark_catalog/__init__.py +3 -0
  19. classifyre_cli-0.4.34/src/sources/spark_catalog/source.py +85 -0
  20. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/tabular_base.py +4 -1
  21. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/dependency_groups.py +5 -0
  22. classifyre_cli-0.4.34/src/utils/spark_runtime.py +56 -0
  23. classifyre_cli-0.4.34/tests/_spark_fakes.py +125 -0
  24. classifyre_cli-0.4.34/tests/test_delta_lake_source.py +96 -0
  25. classifyre_cli-0.4.34/tests/test_hudi_source.py +72 -0
  26. classifyre_cli-0.4.34/tests/test_iceberg_source.py +95 -0
  27. classifyre_cli-0.4.34/tests/test_kafka_source.py +141 -0
  28. classifyre_cli-0.4.34/tests/test_spark_catalog_source.py +71 -0
  29. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/uv.lock +780 -743
  30. classifyre_cli-0.4.32/.turbo/turbo-build.log +0 -3
  31. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/.python-version +0 -0
  32. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/README.md +0 -0
  33. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/main.py +0 -0
  34. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/scripts/generate_models.py +0 -0
  35. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/__init__.py +0 -0
  36. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/config.py +0 -0
  37. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/__init__.py +0 -0
  38. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/base.py +0 -0
  39. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/broken_links/__init__.py +0 -0
  40. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/broken_links/detector.py +0 -0
  41. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/config.py +0 -0
  42. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/content/__init__.py +0 -0
  43. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/__init__.py +0 -0
  44. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/detector.py +0 -0
  45. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/extractor.py +0 -0
  46. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/__init__.py +0 -0
  47. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_base.py +0 -0
  48. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_factory.py +0 -0
  49. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  50. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_gliner2.py +0 -0
  51. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_image_classification.py +0 -0
  52. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_llm.py +0 -0
  53. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_object_detection.py +0 -0
  54. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_regex.py +0 -0
  55. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_text_classification.py +0 -0
  56. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/custom/trainer.py +0 -0
  57. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/dependencies.py +0 -0
  58. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/pii/__init__.py +0 -0
  59. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/pii/detector.py +0 -0
  60. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/secrets/__init__.py +0 -0
  61. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/secrets/detector.py +0 -0
  62. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/threat/__init__.py +0 -0
  63. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/threat/code_security_detector.py +0 -0
  64. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/detectors/threat/yara_detector.py +0 -0
  65. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/main.py +0 -0
  66. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/models/generated_detectors.py +0 -0
  67. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/models/generated_single_asset_scan_results.py +0 -0
  68. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/outputs/__init__.py +0 -0
  69. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/outputs/base.py +0 -0
  70. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/outputs/console.py +0 -0
  71. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/outputs/factory.py +0 -0
  72. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/outputs/file.py +0 -0
  73. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/pipeline/__init__.py +0 -0
  74. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/pipeline/content_provider.py +0 -0
  75. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/pipeline/detector_pipeline.py +0 -0
  76. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/pipeline/parsed_content_provider.py +0 -0
  77. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/pipeline/worker_pool.py +0 -0
  78. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sandbox/__init__.py +0 -0
  79. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sandbox/runner.py +0 -0
  80. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/__init__.py +0 -0
  81. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/asset_metadata.py +0 -0
  82. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/atlassian_common.py +0 -0
  83. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/azure_blob_storage/__init__.py +0 -0
  84. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/azure_blob_storage/source.py +0 -0
  85. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/confluence/__init__.py +0 -0
  86. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/confluence/source.py +0 -0
  87. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/databricks/__init__.py +0 -0
  88. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/databricks/source.py +0 -0
  89. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/dependencies.py +0 -0
  90. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/email/__init__.py +0 -0
  91. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/email/source.py +0 -0
  92. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/google_cloud_storage/__init__.py +0 -0
  93. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/google_cloud_storage/source.py +0 -0
  94. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/hive/__init__.py +0 -0
  95. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/hive/source.py +0 -0
  96. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/jira/__init__.py +0 -0
  97. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/jira/source.py +0 -0
  98. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/mongodb/__init__.py +0 -0
  99. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/mongodb/source.py +0 -0
  100. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/mssql/__init__.py +0 -0
  101. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/mssql/source.py +0 -0
  102. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/mysql/__init__.py +0 -0
  103. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/mysql/source.py +0 -0
  104. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/neo4j/__init__.py +0 -0
  105. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/neo4j/source.py +0 -0
  106. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/notion/__init__.py +0 -0
  107. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/notion/client.py +0 -0
  108. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/notion/source.py +0 -0
  109. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/object_storage/base.py +0 -0
  110. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/oracle/__init__.py +0 -0
  111. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/oracle/source.py +0 -0
  112. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/postgresql/__init__.py +0 -0
  113. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/postgresql/source.py +0 -0
  114. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/powerbi/__init__.py +0 -0
  115. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/powerbi/source.py +0 -0
  116. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/recipe_normalizer.py +0 -0
  117. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/s3_compatible_storage/README.md +0 -0
  118. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/s3_compatible_storage/__init__.py +0 -0
  119. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/s3_compatible_storage/source.py +0 -0
  120. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/servicedesk/__init__.py +0 -0
  121. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/servicedesk/source.py +0 -0
  122. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/slack/__init__.py +0 -0
  123. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/slack/source.py +0 -0
  124. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/snowflake/__init__.py +0 -0
  125. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/snowflake/source.py +0 -0
  126. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/sqlite/__init__.py +0 -0
  127. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/sqlite/source.py +0 -0
  128. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/tableau/__init__.py +0 -0
  129. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/tableau/source.py +0 -0
  130. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/tabular_utils.py +0 -0
  131. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/wordpress/__init__.py +0 -0
  132. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/wordpress/source.py +0 -0
  133. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/youtube/__init__.py +0 -0
  134. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/sources/youtube/source.py +0 -0
  135. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/telemetry.py +0 -0
  136. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/__init__.py +0 -0
  137. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/content_extraction.py +0 -0
  138. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/embedded_images.py +0 -0
  139. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/file_metadata.py +0 -0
  140. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/file_parser.py +0 -0
  141. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/file_to_images.py +0 -0
  142. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/hashing.py +0 -0
  143. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/resources.py +0 -0
  144. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/transcription.py +0 -0
  145. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/uv_sync.py +0 -0
  146. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/src/utils/validation.py +0 -0
  147. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/__init__.py +0 -0
  148. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/conftest.py +0 -0
  149. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/__init__.py +0 -0
  150. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  151. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/conftest.py +0 -0
  152. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/content/__init__.py +0 -0
  153. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/custom/__init__.py +0 -0
  154. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/custom/conftest.py +0 -0
  155. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  156. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/custom/test_llm_runner.py +0 -0
  157. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  158. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/custom/test_regex_runner.py +0 -0
  159. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/custom/test_transformer_runners.py +0 -0
  160. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/pii/__init__.py +0 -0
  161. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/pii/conftest.py +0 -0
  162. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/pii/sample_invoice.pdf +0 -0
  163. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/pii/test_pii_detector.py +0 -0
  164. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  165. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/secrets/__init__.py +0 -0
  166. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  167. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  168. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/test_base_detector.py +0 -0
  169. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  170. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  171. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/test_detector_pipeline_types.py +0 -0
  172. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/test_detector_schema_examples.py +0 -0
  173. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/test_detector_types.py +0 -0
  174. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/test_phase2_detectors.py +0 -0
  175. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/test_registry.py +0 -0
  176. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/threat/__init__.py +0 -0
  177. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/threat/test_code_security_detector.py +0 -0
  178. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/detectors/threat/test_yara_detector.py +0 -0
  179. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  180. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/integration/test_wordpress_links_assets.py +0 -0
  181. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/pipeline/test_detector_pipeline.py +0 -0
  182. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/pipeline/test_worker_pool.py +0 -0
  183. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_assets_metadata_catalog.py +0 -0
  184. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_azure_blob_storage_source.py +0 -0
  185. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_base_source_attachment.py +0 -0
  186. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_base_source_sampling.py +0 -0
  187. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_config.py +0 -0
  188. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_confluence_source.py +0 -0
  189. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_custom_extractor.py +0 -0
  190. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_databricks_source.py +0 -0
  191. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_dependency_groups.py +0 -0
  192. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_email_source.py +0 -0
  193. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_google_cloud_storage_source.py +0 -0
  194. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_hashing.py +0 -0
  195. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_hive_source.py +0 -0
  196. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_jira_source.py +0 -0
  197. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_mongodb_source.py +0 -0
  198. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_mssql_source.py +0 -0
  199. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_mysql_source.py +0 -0
  200. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_neo4j_source.py +0 -0
  201. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_notion_source.py +0 -0
  202. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_oracle_source.py +0 -0
  203. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_outputs.py +0 -0
  204. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_postgresql_source.py +0 -0
  205. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_powerbi_source.py +0 -0
  206. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_recipe_normalizer.py +0 -0
  207. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_s3_compatible_storage_source.py +0 -0
  208. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_sampling_automatic.py +0 -0
  209. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_sandbox_runner.py +0 -0
  210. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_servicedesk_source.py +0 -0
  211. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_slack_source.py +0 -0
  212. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_snowflake_source.py +0 -0
  213. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_source_dependency_groups.py +0 -0
  214. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_sqlite_source.py +0 -0
  215. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_tableau_source.py +0 -0
  216. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_tabular_automatic_sampling.py +0 -0
  217. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_tabular_utils.py +0 -0
  218. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_uv_sync.py +0 -0
  219. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_wordpress_source.py +0 -0
  220. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_youtube_source.py +0 -0
  221. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/test_youtube_source_integration.py +0 -0
  222. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/utils/test_content_extraction.py +0 -0
  223. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/utils/test_embedded_images.py +0 -0
  224. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/utils/test_file_metadata.py +0 -0
  225. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/utils/test_file_parser.py +0 -0
  226. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/utils/test_file_to_images.py +0 -0
  227. {classifyre_cli-0.4.32 → classifyre_cli-0.4.34}/tests/utils/test_transcription.py +0 -0
@@ -63,3 +63,6 @@ dmypy.json
63
63
 
64
64
  # Local training artifacts
65
65
  checkpoints/
66
+
67
+ # Spark local warehouse (created during dev/testing of lakehouse sources)
68
+ spark-warehouse/
@@ -0,0 +1,3 @@
1
+ $ uv sync
2
+ Resolved 268 packages in 168ms
3
+ Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.32
3
+ Version: 0.4.34
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.32",
3
+ "version": "0.4.34",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.32"
3
+ version = "0.4.34"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -202,6 +202,33 @@ youtube = [
202
202
  "yt-dlp>=2025.1.0",
203
203
  "youtube-transcript-api>=1.0.0",
204
204
  ]
205
+ spark = [
206
+ # Shared PySpark runtime for the lakehouse sources. Requires a JDK (Java 21
207
+ # LTS) on the host; format JARs (Delta/Iceberg/Hudi) are resolved at runtime
208
+ # via spark.jars.packages (see SPARK_* env vars). Pinned to the Spark 4.1
209
+ # minor so the format-JAR coordinates below stay version-matched.
210
+ "pyspark>=4.1,<4.2",
211
+ ]
212
+ delta-lake = [
213
+ { include-group = "spark" },
214
+ "delta-spark>=4.0",
215
+ ]
216
+ hudi = [
217
+ # Hudi Spark integration ships as Maven JARs configured via
218
+ # SPARK_JARS_PACKAGES; only the shared PySpark runtime is needed here.
219
+ { include-group = "spark" },
220
+ ]
221
+ spark-catalog = [
222
+ { include-group = "spark" },
223
+ ]
224
+ iceberg = [
225
+ # Inspected through Spark (iceberg-spark-runtime JARs via SPARK_JARS_PACKAGES),
226
+ # sharing the JVM the other lakehouse sources already require.
227
+ { include-group = "spark" },
228
+ ]
229
+ kafka = [
230
+ "kafka-python>=2.0",
231
+ ]
205
232
  otel = [
206
233
  "opentelemetry-sdk>=1.42.0",
207
234
  "opentelemetry-exporter-otlp-proto-http>=1.27.0",
@@ -45,6 +45,11 @@ class AssetType(StrEnum):
45
45
  NOTION = 'NOTION'
46
46
  EMAIL = 'EMAIL'
47
47
  YOUTUBE = 'YOUTUBE'
48
+ DELTA_LAKE = 'DELTA_LAKE'
49
+ ICEBERG = 'ICEBERG'
50
+ HUDI = 'HUDI'
51
+ SPARK_CATALOG = 'SPARK_CATALOG'
52
+ KAFKA = 'KAFKA'
48
53
 
49
54
 
50
55
  class DetectorType(StrEnum):
@@ -343,6 +348,11 @@ class Type(StrEnum):
343
348
  NOTION = 'NOTION'
344
349
  EMAIL = 'EMAIL'
345
350
  YOUTUBE = 'YOUTUBE'
351
+ DELTA_LAKE = 'DELTA_LAKE'
352
+ ICEBERG = 'ICEBERG'
353
+ HUDI = 'HUDI'
354
+ SPARK_CATALOG = 'SPARK_CATALOG'
355
+ KAFKA = 'KAFKA'
346
356
 
347
357
 
348
358
  class YouTubeRequired(BaseModel):
@@ -2850,6 +2860,11 @@ class Type19(StrEnum):
2850
2860
  NOTION = 'NOTION'
2851
2861
  EMAIL = 'EMAIL'
2852
2862
  YOUTUBE = 'YOUTUBE'
2863
+ DELTA_LAKE = 'DELTA_LAKE'
2864
+ ICEBERG = 'ICEBERG'
2865
+ HUDI = 'HUDI'
2866
+ SPARK_CATALOG = 'SPARK_CATALOG'
2867
+ KAFKA = 'KAFKA'
2853
2868
 
2854
2869
 
2855
2870
  class ConfluenceInput(CoreInput):
@@ -3093,6 +3108,474 @@ class NotionInput(CoreInput):
3093
3108
  resources: ResourceOverrides | None = None
3094
3109
 
3095
3110
 
3111
+ class IcebergCatalogType(StrEnum):
3112
+ """
3113
+ PyIceberg catalog backend type
3114
+ """
3115
+
3116
+ REST = 'REST'
3117
+ HIVE = 'HIVE'
3118
+ GLUE = 'GLUE'
3119
+ SQL = 'SQL'
3120
+
3121
+
3122
+ class KafkaSecurityProtocol(StrEnum):
3123
+ """
3124
+ Kafka client security protocol
3125
+ """
3126
+
3127
+ PLAINTEXT = 'PLAINTEXT'
3128
+ SSL = 'SSL'
3129
+ SASL_PLAINTEXT = 'SASL_PLAINTEXT'
3130
+ SASL_SSL = 'SASL_SSL'
3131
+
3132
+
3133
+ class KafkaSaslMechanism(StrEnum):
3134
+ """
3135
+ SASL mechanism used when security_protocol is SASL_*
3136
+ """
3137
+
3138
+ PLAIN = 'PLAIN'
3139
+ SCRAM_SHA_256 = 'SCRAM-SHA-256'
3140
+ SCRAM_SHA_512 = 'SCRAM-SHA-512'
3141
+
3142
+
3143
+ class DeltaLakeRequired(BaseModel):
3144
+ model_config = ConfigDict(
3145
+ extra='forbid',
3146
+ )
3147
+ warehouse_path: str = Field(
3148
+ ...,
3149
+ description='Root storage location holding Delta tables (e.g. s3a://lake/warehouse, file:///data)',
3150
+ )
3151
+
3152
+
3153
+ class DeltaLakeMasked(BaseModel):
3154
+ """
3155
+ Optional object-store credentials for the warehouse location.
3156
+ """
3157
+
3158
+ model_config = ConfigDict(
3159
+ extra='forbid',
3160
+ )
3161
+ s3_access_key_id: str | None = Field(
3162
+ None, description='S3 access key id for object-store warehouses'
3163
+ )
3164
+ s3_secret_access_key: str | None = Field(None, description='S3 secret access key')
3165
+ s3_session_token: str | None = Field(None, description='Optional S3 session token')
3166
+
3167
+
3168
+ class DeltaLakeOptionalConnection(BaseModel):
3169
+ """
3170
+ Delta Lake connection and storage options.
3171
+ """
3172
+
3173
+ model_config = ConfigDict(
3174
+ extra='forbid',
3175
+ )
3176
+ metastore_uri: str | None = Field(
3177
+ None,
3178
+ description='Hive Metastore thrift URI; enables catalog-based table discovery',
3179
+ )
3180
+ endpoint_url: str | None = Field(
3181
+ None, description='Custom S3-compatible endpoint URL'
3182
+ )
3183
+ region: str | None = Field(None, description='Object-store region')
3184
+
3185
+
3186
+ class DeltaLakeOptionalScope(BaseModel):
3187
+ """
3188
+ Delta Lake database and table selection scope.
3189
+ """
3190
+
3191
+ model_config = ConfigDict(
3192
+ extra='forbid',
3193
+ )
3194
+ database: str | None = Field(
3195
+ None, description='Single database/namespace to scan (catalog mode)'
3196
+ )
3197
+ include_all_databases: bool | None = Field(
3198
+ False, description='Scan all visible databases except excluded system databases'
3199
+ )
3200
+ exclude_databases: list[str] | None = Field(
3201
+ ['information_schema', 'sys'], description='Database denylist (exact names)'
3202
+ )
3203
+ include_tables: list[str] | None = Field(
3204
+ None,
3205
+ description='Optional table allowlist. Accepted forms: table or database.table',
3206
+ )
3207
+ table_limit: int | None = Field(
3208
+ None, description='Optional cap on number of table assets per database', ge=1
3209
+ )
3210
+ table_paths: list[str] | None = Field(
3211
+ None,
3212
+ description='Explicit Delta table locations to scan when no metastore is configured',
3213
+ )
3214
+
3215
+
3216
+ class DeltaLakeOptional(BaseModel):
3217
+ model_config = ConfigDict(
3218
+ extra='forbid',
3219
+ )
3220
+ connection: DeltaLakeOptionalConnection | None = None
3221
+ scope: DeltaLakeOptionalScope | None = None
3222
+
3223
+
3224
+ class DeltaLakeInput(CoreInput):
3225
+ type: Literal['DELTA_LAKE'] | None = Field(
3226
+ None, description='Type of the asset or source'
3227
+ )
3228
+ required: DeltaLakeRequired
3229
+ masked: DeltaLakeMasked | None = None
3230
+ optional: DeltaLakeOptional | None = None
3231
+ detectors: list[Detector] | None = Field(
3232
+ None, description='Detectors to run on ingested content'
3233
+ )
3234
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
3235
+ None,
3236
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
3237
+ )
3238
+ sampling: SamplingConfig
3239
+ resources: ResourceOverrides | None = None
3240
+
3241
+
3242
+ class HudiRequired(BaseModel):
3243
+ model_config = ConfigDict(
3244
+ extra='forbid',
3245
+ )
3246
+ warehouse_path: str = Field(
3247
+ ..., description='Root storage location holding Hudi tables'
3248
+ )
3249
+
3250
+
3251
+ class HudiMasked(BaseModel):
3252
+ """
3253
+ Optional object-store credentials for the warehouse location.
3254
+ """
3255
+
3256
+ model_config = ConfigDict(
3257
+ extra='forbid',
3258
+ )
3259
+ s3_access_key_id: str | None = Field(
3260
+ None, description='S3 access key id for object-store warehouses'
3261
+ )
3262
+ s3_secret_access_key: str | None = Field(None, description='S3 secret access key')
3263
+ s3_session_token: str | None = Field(None, description='Optional S3 session token')
3264
+
3265
+
3266
+ class HudiOptionalConnection(BaseModel):
3267
+ """
3268
+ Hudi connection and storage options.
3269
+ """
3270
+
3271
+ model_config = ConfigDict(
3272
+ extra='forbid',
3273
+ )
3274
+ metastore_uri: str | None = Field(
3275
+ None,
3276
+ description='Hive Metastore thrift URI; enables catalog-based table discovery',
3277
+ )
3278
+ endpoint_url: str | None = Field(
3279
+ None, description='Custom S3-compatible endpoint URL'
3280
+ )
3281
+ region: str | None = Field(None, description='Object-store region')
3282
+
3283
+
3284
+ class HudiOptionalScope(BaseModel):
3285
+ """
3286
+ Hudi database and table selection scope.
3287
+ """
3288
+
3289
+ model_config = ConfigDict(
3290
+ extra='forbid',
3291
+ )
3292
+ database: str | None = Field(
3293
+ None, description='Single database/namespace to scan (catalog mode)'
3294
+ )
3295
+ include_all_databases: bool | None = Field(
3296
+ False, description='Scan all visible databases except excluded system databases'
3297
+ )
3298
+ exclude_databases: list[str] | None = Field(
3299
+ ['information_schema', 'sys'], description='Database denylist (exact names)'
3300
+ )
3301
+ include_tables: list[str] | None = Field(
3302
+ None,
3303
+ description='Optional table allowlist. Accepted forms: table or database.table',
3304
+ )
3305
+ table_limit: int | None = Field(
3306
+ None, description='Optional cap on number of table assets per database', ge=1
3307
+ )
3308
+ table_paths: list[str] | None = Field(
3309
+ None,
3310
+ description='Explicit Hudi table locations to scan when no metastore is configured',
3311
+ )
3312
+
3313
+
3314
+ class HudiOptional(BaseModel):
3315
+ model_config = ConfigDict(
3316
+ extra='forbid',
3317
+ )
3318
+ connection: HudiOptionalConnection | None = None
3319
+ scope: HudiOptionalScope | None = None
3320
+
3321
+
3322
+ class HudiInput(CoreInput):
3323
+ type: Literal['HUDI'] | None = Field(
3324
+ None, description='Type of the asset or source'
3325
+ )
3326
+ required: HudiRequired
3327
+ masked: HudiMasked | None = None
3328
+ optional: HudiOptional | None = None
3329
+ detectors: list[Detector] | None = Field(
3330
+ None, description='Detectors to run on ingested content'
3331
+ )
3332
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
3333
+ None,
3334
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
3335
+ )
3336
+ sampling: SamplingConfig
3337
+ resources: ResourceOverrides | None = None
3338
+
3339
+
3340
+ class SparkCatalogRequired(BaseModel):
3341
+ model_config = ConfigDict(
3342
+ extra='forbid',
3343
+ )
3344
+ connect_url: str = Field(
3345
+ ...,
3346
+ description='Spark Connect endpoint (sc://host:15002) or classic master (spark://host:7077)',
3347
+ )
3348
+
3349
+
3350
class SparkCatalogMasked(BaseModel):
    """
    Optional Spark Connect authentication.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Omitted (None) unless a token is supplied.
    token: str | None = Field(
        default=None,
        description="Bearer token for Spark Connect authentication",
    )
3361
+
3362
+
3363
class SparkCatalogOptionalScope(BaseModel):
    """
    Spark catalog and table selection scope.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    catalog: str | None = Field(
        default=None,
        description="Spark catalog name to scan (defaults to the session catalog)",
    )
    database: str | None = Field(
        default=None,
        description="Single database/namespace to scan",
    )
    include_all_databases: bool | None = Field(
        default=False,
        description="Scan all visible databases except excluded system databases",
    )
    # Denylist ships pre-populated with two system database names.
    exclude_databases: list[str] | None = Field(
        default=["information_schema", "sys"],
        description="Database denylist (exact names)",
    )
    include_tables: list[str] | None = Field(
        default=None,
        description="Optional table allowlist. Accepted forms: table or database.table",
    )
    # ge=1: when a limit is given it must be at least one.
    table_limit: int | None = Field(
        default=None,
        description="Optional cap on number of table assets per database",
        ge=1,
    )
3388
+
3389
+
3390
class SparkCatalogOptional(BaseModel):
    # Groups the optional Spark catalog settings; currently only a scope
    # section, omitted (None) by default.
    # Documented with comments rather than a docstring so the model's
    # generated JSON-schema description stays unchanged.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    scope: SparkCatalogOptionalScope | None = None
3395
+
3396
+
3397
class SparkCatalogInput(CoreInput):
    # Source configuration tagged with the literal type 'SPARK_CATALOG'.
    # `required` and `sampling` have no default and must be supplied; every
    # other field declared here defaults to None.
    # Documented with comments rather than a docstring so the model's
    # generated JSON-schema description stays unchanged.
    type: Literal["SPARK_CATALOG"] | None = Field(
        default=None,
        description="Type of the asset or source",
    )
    required: SparkCatalogRequired
    masked: SparkCatalogMasked | None = None
    optional: SparkCatalogOptional | None = None
    detectors: list[Detector] | None = Field(
        default=None,
        description="Detectors to run on ingested content",
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description="Reusable custom detector IDs selected from the custom detector catalog.",
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
3413
+
3414
+
3415
class IcebergRequired(BaseModel):
    # Mandatory Iceberg settings: catalog type and warehouse root, plus a
    # catalog URI that may be left unset.
    # Documented with comments rather than a docstring so the model's
    # generated JSON-schema description stays unchanged.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    catalog_type: IcebergCatalogType
    # NOTE(review): the description says the URI is only optional for GLUE,
    # but nothing in this model enforces it for other catalog types —
    # presumably checked by the consumer; confirm.
    catalog_uri: str | None = Field(
        default=None,
        description="Catalog URI (REST endpoint, Hive metastore thrift URI, or SQL DSN). Not required for GLUE.",
    )
    # Ellipsis default marks the field as required.
    warehouse: str = Field(
        default=...,
        description="Warehouse location root (e.g. s3://bucket/warehouse)",
    )
3427
+
3428
+
3429
class IcebergMasked(BaseModel):
    """
    Optional Iceberg catalog/storage credentials.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # All three credentials are independent and default to None.
    token: str | None = Field(
        default=None,
        description="Bearer token for a REST catalog",
    )
    aws_access_key_id: str | None = Field(
        default=None,
        description="AWS access key id (Glue/S3)",
    )
    aws_secret_access_key: str | None = Field(
        default=None,
        description="AWS secret access key (Glue/S3)",
    )
3444
+
3445
+
3446
class IcebergOptionalScope(BaseModel):
    """
    Iceberg namespace and table selection scope.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    namespace: str | None = Field(
        default=None,
        description="Single namespace to scan (dotted form supported)",
    )
    include_all_namespaces: bool | None = Field(
        default=False,
        description="Scan all visible namespaces",
    )
    include_tables: list[str] | None = Field(
        default=None,
        description="Optional table allowlist. Accepted forms: table or namespace.table",
    )
    # ge=1: when a limit is given it must be at least one.
    table_limit: int | None = Field(
        default=None,
        description="Optional cap on number of table assets per namespace",
        ge=1,
    )
3467
+
3468
+
3469
class IcebergOptional(BaseModel):
    # Groups the optional Iceberg settings; currently only a scope section,
    # omitted (None) by default.
    # Documented with comments rather than a docstring so the model's
    # generated JSON-schema description stays unchanged.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    scope: IcebergOptionalScope | None = None
3474
+
3475
+
3476
class IcebergInput(CoreInput):
    # Source configuration tagged with the literal type 'ICEBERG'.
    # `required` and `sampling` have no default and must be supplied; every
    # other field declared here defaults to None.
    # Documented with comments rather than a docstring so the model's
    # generated JSON-schema description stays unchanged.
    type: Literal["ICEBERG"] | None = Field(
        default=None,
        description="Type of the asset or source",
    )
    required: IcebergRequired
    masked: IcebergMasked | None = None
    optional: IcebergOptional | None = None
    detectors: list[Detector] | None = Field(
        default=None,
        description="Detectors to run on ingested content",
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description="Reusable custom detector IDs selected from the custom detector catalog.",
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
3492
+
3493
+
3494
class KafkaRequired(BaseModel):
    # Mandatory Kafka settings: only the bootstrap server list.
    # Documented with comments rather than a docstring so the model's
    # generated JSON-schema description stays unchanged.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Ellipsis default marks the field as required.
    bootstrap_servers: str = Field(
        default=...,
        description="Comma-separated Kafka bootstrap servers (host:port)",
    )
3501
+
3502
+
3503
class KafkaMasked(BaseModel):
    """
    Optional SASL credentials.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Both credentials default to None; this model does not require them to
    # be supplied together.
    sasl_username: str | None = Field(default=None, description="SASL username")
    sasl_password: str | None = Field(default=None, description="SASL password")
3513
+
3514
+
3515
class KafkaOptionalConnection(BaseModel):
    """
    Kafka client connection and security options.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # NOTE(review): the two defaults below are plain strings while the
    # annotations name KafkaSecurityProtocol / KafkaSaslMechanism. Presumably
    # those are enums whose values include these strings — confirm, since
    # defaults are not validated unless validate_default is enabled.
    security_protocol: KafkaSecurityProtocol | None = Field(default="PLAINTEXT")
    sasl_mechanism: KafkaSaslMechanism | None = Field(default="PLAIN")
    ssl_ca: str | None = Field(
        default=None,
        description="PEM-encoded CA certificate for TLS verification",
    )
    # Defaults to 30000 ms; ge=1000 rejects anything under one second.
    request_timeout_ms: int | None = Field(
        default=30000,
        description="Client request timeout in milliseconds",
        ge=1000,
    )
3531
+
3532
+
3533
class KafkaOptionalScope(BaseModel):
    """
    Kafka topic selection scope.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    include_topics: list[str] | None = Field(
        default=None,
        description="Optional topic allowlist",
    )
    exclude_topics: list[str] | None = Field(
        default=None,
        description="Topic denylist",
    )
    include_internal: bool | None = Field(
        default=False,
        description="Include internal topics (names starting with __)",
    )
    # ge=1: when a limit is given it must be at least one.
    topic_limit: int | None = Field(
        default=None,
        description="Optional cap on number of topic assets",
        ge=1,
    )
3551
+
3552
+
3553
class KafkaOptional(BaseModel):
    # Groups the optional Kafka settings: a connection section and a scope
    # section, both omitted (None) by default.
    # Documented with comments rather than a docstring so the model's
    # generated JSON-schema description stays unchanged.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    connection: KafkaOptionalConnection | None = None
    scope: KafkaOptionalScope | None = None
3559
+
3560
+
3561
class KafkaInput(CoreInput):
    # Source configuration tagged with the literal type 'KAFKA'.
    # `required` and `sampling` have no default and must be supplied; every
    # other field declared here defaults to None.
    # Documented with comments rather than a docstring so the model's
    # generated JSON-schema description stays unchanged.
    type: Literal["KAFKA"] | None = Field(
        default=None,
        description="Type of the asset or source",
    )
    required: KafkaRequired
    masked: KafkaMasked | None = None
    optional: KafkaOptional | None = None
    detectors: list[Detector] | None = Field(
        default=None,
        description="Detectors to run on ingested content",
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description="Reusable custom detector IDs selected from the custom detector catalog.",
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
3577
+
3578
+
3096
3579
  class YouTubeInput(CoreInput):
3097
3580
  type: Literal['YOUTUBE'] | None = Field(
3098
3581
  None, description='Type of the asset or source'
@@ -3136,6 +3619,11 @@ class SourceInput(
3136
3619
  | NotionInput
3137
3620
  | EmailInput
3138
3621
  | YouTubeInput
3622
+ | DeltaLakeInput
3623
+ | IcebergInput
3624
+ | HudiInput
3625
+ | SparkCatalogInput
3626
+ | KafkaInput
3139
3627
  ]
3140
3628
  ):
3141
3629
  root: (
@@ -3162,6 +3650,11 @@ class SourceInput(
3162
3650
  | NotionInput
3163
3651
  | EmailInput
3164
3652
  | YouTubeInput
3653
+ | DeltaLakeInput
3654
+ | IcebergInput
3655
+ | HudiInput
3656
+ | SparkCatalogInput
3657
+ | KafkaInput
3165
3658
  ) = Field(
3166
3659
  ...,
3167
3660
  description='Merged configuration schema with all source types and common definitions',
@@ -129,9 +129,7 @@ class FinalizeIngestRunRequest(BaseModel):
129
129
  seen_hashes: list[str] = Field(serialization_alias="seenHashes")
130
130
  # AUTOMATIC sampling cursor to persist on the source for the next run.
131
131
  # Omitted (None) for other strategies so the stored cursor is left untouched.
132
- sampling_cursor: dict[str, Any] | None = Field(
133
- None, serialization_alias="samplingCursor"
134
- )
132
+ sampling_cursor: dict[str, Any] | None = Field(None, serialization_alias="samplingCursor")
135
133
 
136
134
 
137
135
  class UpdateRunnerStatusRequest(BaseModel):
@@ -133,9 +133,7 @@ class BaseSource(ABC):
133
133
  saved = self._sampling_cursor.get(key)
134
134
  return saved if isinstance(saved, int) and saved >= 0 else 0
135
135
 
136
- def record_automatic_offset(
137
- self, key: str, *, prev_offset: int, fetched: int
138
- ) -> None:
136
+ def record_automatic_offset(self, key: str, *, prev_offset: int, fetched: int) -> None:
139
137
  """Advance a keyed offset cursor; wrap to 0 once a page underfills.
140
138
 
141
139
  Used by sources that page rows directly from the backing store
@@ -0,0 +1,3 @@
1
+ from .source import DeltaLakeSource
2
+
3
+ __all__ = ["DeltaLakeSource"]