classifyre-cli 0.4.35__tar.gz → 0.4.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.37/.turbo/turbo-build.log +3 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/PKG-INFO +1 -1
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/package.json +1 -1
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/pyproject.toml +24 -27
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/models/generated_input.py +116 -341
- classifyre_cli-0.4.37/src/sources/delta_lake/source.py +146 -0
- classifyre_cli-0.4.37/src/sources/iceberg/source.py +155 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/kafka/source.py +161 -100
- classifyre_cli-0.4.37/src/sources/lakehouse_base.py +407 -0
- classifyre_cli-0.4.37/src/sources/s3_client.py +72 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/s3_compatible_storage/source.py +9 -40
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/dependency_groups.py +0 -2
- classifyre_cli-0.4.37/tests/_lakehouse_fakes.py +41 -0
- classifyre_cli-0.4.37/tests/test_delta_lake_source.py +165 -0
- classifyre_cli-0.4.37/tests/test_iceberg_source.py +175 -0
- classifyre_cli-0.4.37/tests/test_kafka_source.py +251 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_source_dependency_groups.py +5 -3
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/uv.lock +482 -231
- classifyre_cli-0.4.35/.turbo/turbo-build.log +0 -3
- classifyre_cli-0.4.35/src/sources/delta_lake/source.py +0 -139
- classifyre_cli-0.4.35/src/sources/hudi/__init__.py +0 -3
- classifyre_cli-0.4.35/src/sources/hudi/source.py +0 -98
- classifyre_cli-0.4.35/src/sources/iceberg/source.py +0 -148
- classifyre_cli-0.4.35/src/sources/spark_base.py +0 -413
- classifyre_cli-0.4.35/src/sources/spark_catalog/__init__.py +0 -3
- classifyre_cli-0.4.35/src/sources/spark_catalog/source.py +0 -93
- classifyre_cli-0.4.35/src/utils/spark_runtime.py +0 -56
- classifyre_cli-0.4.35/tests/_spark_fakes.py +0 -125
- classifyre_cli-0.4.35/tests/test_delta_lake_source.py +0 -96
- classifyre_cli-0.4.35/tests/test_hudi_source.py +0 -72
- classifyre_cli-0.4.35/tests/test_iceberg_source.py +0 -95
- classifyre_cli-0.4.35/tests/test_kafka_source.py +0 -192
- classifyre_cli-0.4.35/tests/test_spark_catalog_source.py +0 -71
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/.gitignore +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/.python-version +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/README.md +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/main.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/config.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/main.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/pipeline/detector_pipeline.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/asset_metadata.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/databricks/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/delta_lake/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/elasticsearch/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/elasticsearch/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/email/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/email/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/iceberg/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/kafka/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/meilisearch/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/meilisearch/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/notion/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/notion/client.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/notion/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/object_storage/base.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/opensearch/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/opensearch/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/search_engine_base.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/tabular_base.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/youtube/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/sources/youtube/source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/file_metadata.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/resources.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/transcription.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_assets_metadata_catalog.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_config.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_dependency_groups.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_elasticsearch_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_email_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_meilisearch_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_notion_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_opensearch_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_sampling_automatic.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_tabular_automatic_sampling.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_uv_sync.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_youtube_source.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/test_youtube_source_integration.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/utils/test_file_metadata.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/utils/test_file_parser.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/utils/test_file_to_images.py +0 -0
- {classifyre_cli-0.4.35 → classifyre_cli-0.4.37}/tests/utils/test_transcription.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "classifyre-cli"
|
|
3
|
-
version = "0.4.35"
|
|
3
|
+
version = "0.4.37"
|
|
4
4
|
description = "Classifyre CLI — scan and classify unstructured data sources"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -51,6 +51,8 @@ dev = [
|
|
|
51
51
|
"datamodel-code-generator>=0.57.0",
|
|
52
52
|
"pytest>=9.0.3",
|
|
53
53
|
"pytest-asyncio>=0.24.0",
|
|
54
|
+
# Real SQL engine for the lakehouse source tests (sampling over Parquet).
|
|
55
|
+
"duckdb>=1.1.0",
|
|
54
56
|
]
|
|
55
57
|
privacy = [
|
|
56
58
|
"presidio-analyzer>=2.2.362",
|
|
@@ -134,6 +136,9 @@ file-processing = [
|
|
|
134
136
|
"openpyxl>=3.1.5",
|
|
135
137
|
"chardet>=7.4.3",
|
|
136
138
|
"pyarrow>=18.0.0",
|
|
139
|
+
# Table layer: Iceberg metadata resolution + SQL over Parquet without Spark.
|
|
140
|
+
"pyiceberg>=0.9.0",
|
|
141
|
+
"duckdb>=1.1.0",
|
|
137
142
|
]
|
|
138
143
|
transcription = [
|
|
139
144
|
# CPU audio/video transcription. faster-whisper decodes media via bundled
|
|
@@ -202,37 +207,23 @@ youtube = [
|
|
|
202
207
|
"yt-dlp>=2025.1.0",
|
|
203
208
|
"youtube-transcript-api>=1.0.0",
|
|
204
209
|
]
|
|
205
|
-
spark = [
|
|
206
|
-
# Shared PySpark runtime for the lakehouse sources. Requires a JDK (Java 21
|
|
207
|
-
# LTS) on the host; format JARs (Delta/Iceberg/Hudi) are resolved at runtime
|
|
208
|
-
# via spark.jars.packages (see SPARK_* env vars). Pinned to the Spark 4.1
|
|
209
|
-
# minor so the format-JAR coordinates below stay version-matched.
|
|
210
|
-
#
|
|
211
|
-
# The `connect` extra pulls in pandas>=2.2, pyarrow, and grpcio, which the
|
|
212
|
-
# Spark Connect client (Spark Catalog via sc:// URLs) hard-requires at session
|
|
213
|
-
# build time. Classic/local Spark (Delta/Hudi/Iceberg) does not need them, but
|
|
214
|
-
# they share this group, so we ship the client deps once for all of them.
|
|
215
|
-
"pyspark[connect]>=4.1,<4.2",
|
|
216
|
-
]
|
|
217
210
|
delta-lake = [
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
{ include-group = "spark" },
|
|
225
|
-
]
|
|
226
|
-
spark-catalog = [
|
|
227
|
-
{ include-group = "spark" },
|
|
211
|
+
# Pure-Python/Rust Delta reader (delta-rs) — no Spark, no JVM. Table metadata
|
|
212
|
+
# and data-file lists come from deltalake; rows are sampled through DuckDB
|
|
213
|
+
# (file-processing group) over the table's Parquet files.
|
|
214
|
+
{ include-group = "file-processing" },
|
|
215
|
+
"deltalake>=0.25.0",
|
|
216
|
+
"boto3>=1.43.10,<2.0.0",
|
|
228
217
|
]
|
|
229
218
|
iceberg = [
|
|
230
|
-
#
|
|
231
|
-
#
|
|
232
|
-
{ include-group = "spark" },
|
|
219
|
+
# PyIceberg resolves table metadata and finds Parquet files without Spark;
|
|
220
|
+
# DuckDB (file-processing group) samples rows from those files.
|
|
221
|
+
{ include-group = "file-processing" },
|
|
222
|
+
"boto3>=1.43.10,<2.0.0",
|
|
233
223
|
]
|
|
234
224
|
kafka = [
|
|
235
|
-
|
|
225
|
+
# librdkafka-based client (bundled wheels, no JVM).
|
|
226
|
+
"confluent-kafka>=2.5.0",
|
|
236
227
|
]
|
|
237
228
|
otel = [
|
|
238
229
|
"opentelemetry-sdk>=1.42.0",
|
|
@@ -353,6 +344,12 @@ module = [
|
|
|
353
344
|
"re2.*",
|
|
354
345
|
"faster_whisper",
|
|
355
346
|
"faster_whisper.*",
|
|
347
|
+
"pyiceberg.*",
|
|
348
|
+
"deltalake.*",
|
|
349
|
+
"duckdb.*",
|
|
350
|
+
"confluent_kafka.*",
|
|
351
|
+
"boto3.*",
|
|
352
|
+
"botocore.*",
|
|
356
353
|
]
|
|
357
354
|
ignore_missing_imports = true
|
|
358
355
|
|
|
@@ -47,8 +47,6 @@ class AssetType(StrEnum):
|
|
|
47
47
|
YOUTUBE = 'YOUTUBE'
|
|
48
48
|
DELTA_LAKE = 'DELTA_LAKE'
|
|
49
49
|
ICEBERG = 'ICEBERG'
|
|
50
|
-
HUDI = 'HUDI'
|
|
51
|
-
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
52
50
|
KAFKA = 'KAFKA'
|
|
53
51
|
ELASTICSEARCH = 'ELASTICSEARCH'
|
|
54
52
|
OPENSEARCH = 'OPENSEARCH'
|
|
@@ -353,8 +351,6 @@ class Type(StrEnum):
|
|
|
353
351
|
YOUTUBE = 'YOUTUBE'
|
|
354
352
|
DELTA_LAKE = 'DELTA_LAKE'
|
|
355
353
|
ICEBERG = 'ICEBERG'
|
|
356
|
-
HUDI = 'HUDI'
|
|
357
|
-
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
358
354
|
KAFKA = 'KAFKA'
|
|
359
355
|
ELASTICSEARCH = 'ELASTICSEARCH'
|
|
360
356
|
OPENSEARCH = 'OPENSEARCH'
|
|
@@ -2868,8 +2864,6 @@ class Type19(StrEnum):
|
|
|
2868
2864
|
YOUTUBE = 'YOUTUBE'
|
|
2869
2865
|
DELTA_LAKE = 'DELTA_LAKE'
|
|
2870
2866
|
ICEBERG = 'ICEBERG'
|
|
2871
|
-
HUDI = 'HUDI'
|
|
2872
|
-
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
2873
2867
|
KAFKA = 'KAFKA'
|
|
2874
2868
|
ELASTICSEARCH = 'ELASTICSEARCH'
|
|
2875
2869
|
OPENSEARCH = 'OPENSEARCH'
|
|
@@ -3117,17 +3111,6 @@ class NotionInput(CoreInput):
|
|
|
3117
3111
|
resources: ResourceOverrides | None = None
|
|
3118
3112
|
|
|
3119
3113
|
|
|
3120
|
-
class IcebergCatalogType(StrEnum):
|
|
3121
|
-
"""
|
|
3122
|
-
PyIceberg catalog backend type
|
|
3123
|
-
"""
|
|
3124
|
-
|
|
3125
|
-
REST = 'REST'
|
|
3126
|
-
HIVE = 'HIVE'
|
|
3127
|
-
GLUE = 'GLUE'
|
|
3128
|
-
SQL = 'SQL'
|
|
3129
|
-
|
|
3130
|
-
|
|
3131
3114
|
class KafkaSecurityProtocol(StrEnum):
|
|
3132
3115
|
"""
|
|
3133
3116
|
Kafka client security protocol
|
|
@@ -3153,351 +3136,41 @@ class DeltaLakeRequired(BaseModel):
|
|
|
3153
3136
|
model_config = ConfigDict(
|
|
3154
3137
|
extra='forbid',
|
|
3155
3138
|
)
|
|
3156
|
-
|
|
3139
|
+
bucket: str = Field(
|
|
3157
3140
|
...,
|
|
3158
|
-
description='
|
|
3159
|
-
)
|
|
3160
|
-
|
|
3161
|
-
|
|
3162
|
-
class DeltaLakeMasked(BaseModel):
|
|
3163
|
-
"""
|
|
3164
|
-
Optional object-store credentials for the warehouse location.
|
|
3165
|
-
"""
|
|
3166
|
-
|
|
3167
|
-
model_config = ConfigDict(
|
|
3168
|
-
extra='forbid',
|
|
3169
|
-
)
|
|
3170
|
-
s3_access_key_id: str | None = Field(
|
|
3171
|
-
None, description='S3 access key id for object-store warehouses'
|
|
3172
|
-
)
|
|
3173
|
-
s3_secret_access_key: str | None = Field(None, description='S3 secret access key')
|
|
3174
|
-
s3_session_token: str | None = Field(None, description='Optional S3 session token')
|
|
3175
|
-
|
|
3176
|
-
|
|
3177
|
-
class DeltaLakeOptionalConnection(BaseModel):
|
|
3178
|
-
"""
|
|
3179
|
-
Delta Lake connection and storage options.
|
|
3180
|
-
"""
|
|
3181
|
-
|
|
3182
|
-
model_config = ConfigDict(
|
|
3183
|
-
extra='forbid',
|
|
3184
|
-
)
|
|
3185
|
-
metastore_uri: str | None = Field(
|
|
3186
|
-
None,
|
|
3187
|
-
description='Hive Metastore thrift URI; enables catalog-based table discovery',
|
|
3188
|
-
)
|
|
3189
|
-
endpoint_url: str | None = Field(
|
|
3190
|
-
None, description='Custom S3-compatible endpoint URL'
|
|
3141
|
+
description='Bucket holding the tables — AWS S3, MinIO, Cloudflare R2, Backblaze B2, Garage, and other S3-compatible endpoints',
|
|
3191
3142
|
)
|
|
3192
|
-
region: str | None = Field(None, description='Object-store region')
|
|
3193
3143
|
|
|
3194
3144
|
|
|
3195
3145
|
class DeltaLakeOptionalScope(BaseModel):
|
|
3196
3146
|
"""
|
|
3197
|
-
Delta Lake database and table selection scope.
|
|
3198
|
-
"""
|
|
3199
|
-
|
|
3200
|
-
model_config = ConfigDict(
|
|
3201
|
-
extra='forbid',
|
|
3202
|
-
)
|
|
3203
|
-
database: str | None = Field(
|
|
3204
|
-
None, description='Single database/namespace to scan (catalog mode)'
|
|
3205
|
-
)
|
|
3206
|
-
include_all_databases: bool | None = Field(
|
|
3207
|
-
False, description='Scan all visible databases except excluded system databases'
|
|
3208
|
-
)
|
|
3209
|
-
exclude_databases: list[str] | None = Field(
|
|
3210
|
-
['information_schema', 'sys'], description='Database denylist (exact names)'
|
|
3211
|
-
)
|
|
3212
|
-
include_tables: list[str] | None = Field(
|
|
3213
|
-
None,
|
|
3214
|
-
description='Optional table allowlist. Accepted forms: table or database.table',
|
|
3215
|
-
)
|
|
3216
|
-
table_limit: int | None = Field(
|
|
3217
|
-
None, description='Optional cap on number of table assets per database', ge=1
|
|
3218
|
-
)
|
|
3219
|
-
table_paths: list[str] | None = Field(
|
|
3220
|
-
None,
|
|
3221
|
-
description='Explicit Delta table locations to scan when no metastore is configured',
|
|
3222
|
-
)
|
|
3223
|
-
|
|
3224
|
-
|
|
3225
|
-
class DeltaLakeOptional(BaseModel):
|
|
3226
|
-
model_config = ConfigDict(
|
|
3227
|
-
extra='forbid',
|
|
3228
|
-
)
|
|
3229
|
-
connection: DeltaLakeOptionalConnection | None = None
|
|
3230
|
-
scope: DeltaLakeOptionalScope | None = None
|
|
3231
|
-
|
|
3232
|
-
|
|
3233
|
-
class DeltaLakeInput(CoreInput):
|
|
3234
|
-
type: Literal['DELTA_LAKE'] | None = Field(
|
|
3235
|
-
None, description='Type of the asset or source'
|
|
3236
|
-
)
|
|
3237
|
-
required: DeltaLakeRequired
|
|
3238
|
-
masked: DeltaLakeMasked | None = None
|
|
3239
|
-
optional: DeltaLakeOptional | None = None
|
|
3240
|
-
detectors: list[Detector] | None = Field(
|
|
3241
|
-
None, description='Detectors to run on ingested content'
|
|
3242
|
-
)
|
|
3243
|
-
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3244
|
-
None,
|
|
3245
|
-
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3246
|
-
)
|
|
3247
|
-
sampling: SamplingConfig
|
|
3248
|
-
resources: ResourceOverrides | None = None
|
|
3249
|
-
|
|
3250
|
-
|
|
3251
|
-
class HudiRequired(BaseModel):
|
|
3252
|
-
model_config = ConfigDict(
|
|
3253
|
-
extra='forbid',
|
|
3254
|
-
)
|
|
3255
|
-
warehouse_path: str = Field(
|
|
3256
|
-
..., description='Root storage location holding Hudi tables'
|
|
3257
|
-
)
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
class HudiMasked(BaseModel):
|
|
3261
|
-
"""
|
|
3262
|
-
Optional object-store credentials for the warehouse location.
|
|
3263
|
-
"""
|
|
3264
|
-
|
|
3265
|
-
model_config = ConfigDict(
|
|
3266
|
-
extra='forbid',
|
|
3267
|
-
)
|
|
3268
|
-
s3_access_key_id: str | None = Field(
|
|
3269
|
-
None, description='S3 access key id for object-store warehouses'
|
|
3270
|
-
)
|
|
3271
|
-
s3_secret_access_key: str | None = Field(None, description='S3 secret access key')
|
|
3272
|
-
s3_session_token: str | None = Field(None, description='Optional S3 session token')
|
|
3273
|
-
|
|
3274
|
-
|
|
3275
|
-
class HudiOptionalConnection(BaseModel):
|
|
3276
|
-
"""
|
|
3277
|
-
Hudi connection and storage options.
|
|
3147
|
+
Delta Lake table selection scope within the bucket.
|
|
3278
3148
|
"""
|
|
3279
3149
|
|
|
3280
3150
|
model_config = ConfigDict(
|
|
3281
3151
|
extra='forbid',
|
|
3282
3152
|
)
|
|
3283
|
-
|
|
3284
|
-
None,
|
|
3285
|
-
description='Hive Metastore thrift URI; enables catalog-based table discovery',
|
|
3286
|
-
)
|
|
3287
|
-
endpoint_url: str | None = Field(
|
|
3288
|
-
None, description='Custom S3-compatible endpoint URL'
|
|
3289
|
-
)
|
|
3290
|
-
region: str | None = Field(None, description='Object-store region')
|
|
3291
|
-
|
|
3292
|
-
|
|
3293
|
-
class HudiOptionalScope(BaseModel):
|
|
3294
|
-
"""
|
|
3295
|
-
Hudi database and table selection scope.
|
|
3296
|
-
"""
|
|
3297
|
-
|
|
3298
|
-
model_config = ConfigDict(
|
|
3299
|
-
extra='forbid',
|
|
3300
|
-
)
|
|
3301
|
-
database: str | None = Field(
|
|
3302
|
-
None, description='Single database/namespace to scan (catalog mode)'
|
|
3303
|
-
)
|
|
3304
|
-
include_all_databases: bool | None = Field(
|
|
3305
|
-
False, description='Scan all visible databases except excluded system databases'
|
|
3306
|
-
)
|
|
3307
|
-
exclude_databases: list[str] | None = Field(
|
|
3308
|
-
['information_schema', 'sys'], description='Database denylist (exact names)'
|
|
3309
|
-
)
|
|
3310
|
-
include_tables: list[str] | None = Field(
|
|
3153
|
+
prefix: str | None = Field(
|
|
3311
3154
|
None,
|
|
3312
|
-
description='
|
|
3313
|
-
)
|
|
3314
|
-
table_limit: int | None = Field(
|
|
3315
|
-
None, description='Optional cap on number of table assets per database', ge=1
|
|
3155
|
+
description='Key prefix to search for Delta Lake tables (e.g. warehouse/). Tables are auto-discovered by their _delta_log/ directory.',
|
|
3316
3156
|
)
|
|
3317
3157
|
table_paths: list[str] | None = Field(
|
|
3318
3158
|
None,
|
|
3319
|
-
description='Explicit
|
|
3320
|
-
)
|
|
3321
|
-
|
|
3322
|
-
|
|
3323
|
-
class HudiOptional(BaseModel):
|
|
3324
|
-
model_config = ConfigDict(
|
|
3325
|
-
extra='forbid',
|
|
3326
|
-
)
|
|
3327
|
-
connection: HudiOptionalConnection | None = None
|
|
3328
|
-
scope: HudiOptionalScope | None = None
|
|
3329
|
-
|
|
3330
|
-
|
|
3331
|
-
class HudiInput(CoreInput):
|
|
3332
|
-
type: Literal['HUDI'] | None = Field(
|
|
3333
|
-
None, description='Type of the asset or source'
|
|
3334
|
-
)
|
|
3335
|
-
required: HudiRequired
|
|
3336
|
-
masked: HudiMasked | None = None
|
|
3337
|
-
optional: HudiOptional | None = None
|
|
3338
|
-
detectors: list[Detector] | None = Field(
|
|
3339
|
-
None, description='Detectors to run on ingested content'
|
|
3340
|
-
)
|
|
3341
|
-
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3342
|
-
None,
|
|
3343
|
-
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3344
|
-
)
|
|
3345
|
-
sampling: SamplingConfig
|
|
3346
|
-
resources: ResourceOverrides | None = None
|
|
3347
|
-
|
|
3348
|
-
|
|
3349
|
-
class SparkCatalogRequired(BaseModel):
|
|
3350
|
-
model_config = ConfigDict(
|
|
3351
|
-
extra='forbid',
|
|
3352
|
-
)
|
|
3353
|
-
connect_url: str = Field(
|
|
3354
|
-
...,
|
|
3355
|
-
description='Spark Connect endpoint (sc://host:15002) or classic master (spark://host:7077)',
|
|
3356
|
-
)
|
|
3357
|
-
|
|
3358
|
-
|
|
3359
|
-
class SparkCatalogMasked(BaseModel):
|
|
3360
|
-
"""
|
|
3361
|
-
Optional Spark Connect authentication.
|
|
3362
|
-
"""
|
|
3363
|
-
|
|
3364
|
-
model_config = ConfigDict(
|
|
3365
|
-
extra='forbid',
|
|
3366
|
-
)
|
|
3367
|
-
token: str | None = Field(
|
|
3368
|
-
None, description='Bearer token for Spark Connect authentication'
|
|
3369
|
-
)
|
|
3370
|
-
|
|
3371
|
-
|
|
3372
|
-
class SparkCatalogOptionalScope(BaseModel):
|
|
3373
|
-
"""
|
|
3374
|
-
Spark catalog and table selection scope.
|
|
3375
|
-
"""
|
|
3376
|
-
|
|
3377
|
-
model_config = ConfigDict(
|
|
3378
|
-
extra='forbid',
|
|
3379
|
-
)
|
|
3380
|
-
catalog: str | None = Field(
|
|
3381
|
-
None, description='Spark catalog name to scan (defaults to the session catalog)'
|
|
3382
|
-
)
|
|
3383
|
-
database: str | None = Field(None, description='Single database/namespace to scan')
|
|
3384
|
-
include_all_databases: bool | None = Field(
|
|
3385
|
-
False, description='Scan all visible databases except excluded system databases'
|
|
3386
|
-
)
|
|
3387
|
-
exclude_databases: list[str] | None = Field(
|
|
3388
|
-
['information_schema', 'sys'], description='Database denylist (exact names)'
|
|
3389
|
-
)
|
|
3390
|
-
include_tables: list[str] | None = Field(
|
|
3391
|
-
None,
|
|
3392
|
-
description='Optional table allowlist. Accepted forms: table or database.table',
|
|
3159
|
+
description='Explicit Delta Lake table root keys or s3:// URIs. When set, auto-discovery under prefix is skipped.',
|
|
3393
3160
|
)
|
|
3394
3161
|
table_limit: int | None = Field(
|
|
3395
|
-
None, description='Optional cap on number of table assets
|
|
3162
|
+
None, description='Optional cap on number of table assets', ge=1
|
|
3396
3163
|
)
|
|
3397
3164
|
|
|
3398
3165
|
|
|
3399
|
-
class SparkCatalogOptional(BaseModel):
|
|
3400
|
-
model_config = ConfigDict(
|
|
3401
|
-
extra='forbid',
|
|
3402
|
-
)
|
|
3403
|
-
scope: SparkCatalogOptionalScope | None = None
|
|
3404
|
-
|
|
3405
|
-
|
|
3406
|
-
class SparkCatalogInput(CoreInput):
|
|
3407
|
-
type: Literal['SPARK_CATALOG'] | None = Field(
|
|
3408
|
-
None, description='Type of the asset or source'
|
|
3409
|
-
)
|
|
3410
|
-
required: SparkCatalogRequired
|
|
3411
|
-
masked: SparkCatalogMasked | None = None
|
|
3412
|
-
optional: SparkCatalogOptional | None = None
|
|
3413
|
-
detectors: list[Detector] | None = Field(
|
|
3414
|
-
None, description='Detectors to run on ingested content'
|
|
3415
|
-
)
|
|
3416
|
-
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3417
|
-
None,
|
|
3418
|
-
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3419
|
-
)
|
|
3420
|
-
sampling: SamplingConfig
|
|
3421
|
-
resources: ResourceOverrides | None = None
|
|
3422
|
-
|
|
3423
|
-
|
|
3424
3166
|
class IcebergRequired(BaseModel):
|
|
3425
3167
|
model_config = ConfigDict(
|
|
3426
3168
|
extra='forbid',
|
|
3427
3169
|
)
|
|
3428
|
-
|
|
3429
|
-
|
|
3430
|
-
|
|
3431
|
-
description='Catalog URI (REST endpoint, Hive metastore thrift URI, or SQL DSN). Not required for GLUE.',
|
|
3432
|
-
)
|
|
3433
|
-
warehouse: str = Field(
|
|
3434
|
-
..., description='Warehouse location root (e.g. s3://bucket/warehouse)'
|
|
3435
|
-
)
|
|
3436
|
-
|
|
3437
|
-
|
|
3438
|
-
class IcebergMasked(BaseModel):
|
|
3439
|
-
"""
|
|
3440
|
-
Optional Iceberg catalog/storage credentials.
|
|
3441
|
-
"""
|
|
3442
|
-
|
|
3443
|
-
model_config = ConfigDict(
|
|
3444
|
-
extra='forbid',
|
|
3445
|
-
)
|
|
3446
|
-
token: str | None = Field(None, description='Bearer token for a REST catalog')
|
|
3447
|
-
aws_access_key_id: str | None = Field(
|
|
3448
|
-
None, description='AWS access key id (Glue/S3)'
|
|
3449
|
-
)
|
|
3450
|
-
aws_secret_access_key: str | None = Field(
|
|
3451
|
-
None, description='AWS secret access key (Glue/S3)'
|
|
3452
|
-
)
|
|
3453
|
-
|
|
3454
|
-
|
|
3455
|
-
class IcebergOptionalScope(BaseModel):
|
|
3456
|
-
"""
|
|
3457
|
-
Iceberg namespace and table selection scope.
|
|
3458
|
-
"""
|
|
3459
|
-
|
|
3460
|
-
model_config = ConfigDict(
|
|
3461
|
-
extra='forbid',
|
|
3462
|
-
)
|
|
3463
|
-
namespace: str | None = Field(
|
|
3464
|
-
None, description='Single namespace to scan (dotted form supported)'
|
|
3465
|
-
)
|
|
3466
|
-
include_all_namespaces: bool | None = Field(
|
|
3467
|
-
False, description='Scan all visible namespaces'
|
|
3468
|
-
)
|
|
3469
|
-
include_tables: list[str] | None = Field(
|
|
3470
|
-
None,
|
|
3471
|
-
description='Optional table allowlist. Accepted forms: table or namespace.table',
|
|
3472
|
-
)
|
|
3473
|
-
table_limit: int | None = Field(
|
|
3474
|
-
None, description='Optional cap on number of table assets per namespace', ge=1
|
|
3475
|
-
)
|
|
3476
|
-
|
|
3477
|
-
|
|
3478
|
-
class IcebergOptional(BaseModel):
|
|
3479
|
-
model_config = ConfigDict(
|
|
3480
|
-
extra='forbid',
|
|
3481
|
-
)
|
|
3482
|
-
scope: IcebergOptionalScope | None = None
|
|
3483
|
-
|
|
3484
|
-
|
|
3485
|
-
class IcebergInput(CoreInput):
|
|
3486
|
-
type: Literal['ICEBERG'] | None = Field(
|
|
3487
|
-
None, description='Type of the asset or source'
|
|
3488
|
-
)
|
|
3489
|
-
required: IcebergRequired
|
|
3490
|
-
masked: IcebergMasked | None = None
|
|
3491
|
-
optional: IcebergOptional | None = None
|
|
3492
|
-
detectors: list[Detector] | None = Field(
|
|
3493
|
-
None, description='Detectors to run on ingested content'
|
|
3494
|
-
)
|
|
3495
|
-
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3496
|
-
None,
|
|
3497
|
-
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3170
|
+
bucket: str = Field(
|
|
3171
|
+
...,
|
|
3172
|
+
description='Bucket holding the tables — AWS S3, MinIO, Cloudflare R2, Backblaze B2, Garage, and other S3-compatible endpoints',
|
|
3498
3173
|
)
|
|
3499
|
-
sampling: SamplingConfig
|
|
3500
|
-
resources: ResourceOverrides | None = None
|
|
3501
3174
|
|
|
3502
3175
|
|
|
3503
3176
|
class NoAuthentication(BaseModel):
|
|
@@ -3881,6 +3554,60 @@ class MeilisearchInput(CoreInput):
|
|
|
3881
3554
|
resources: ResourceOverrides | None = None
|
|
3882
3555
|
|
|
3883
3556
|
|
|
3557
|
+
class LakehouseStorageConnection(BaseModel):
|
|
3558
|
+
"""
|
|
3559
|
+
S3-compatible storage connection options (AWS S3, MinIO, Cloudflare R2, Backblaze B2, Garage, ...). Mirrors the S3 Compatible Storage source connection settings.
|
|
3560
|
+
"""
|
|
3561
|
+
|
|
3562
|
+
model_config = ConfigDict(
|
|
3563
|
+
extra='forbid',
|
|
3564
|
+
)
|
|
3565
|
+
region_name: str | None = Field(
|
|
3566
|
+
None,
|
|
3567
|
+
description='Region (recommended for AWS; required by some S3-compatible providers)',
|
|
3568
|
+
)
|
|
3569
|
+
endpoint_url: AnyUrl | None = Field(
|
|
3570
|
+
None,
|
|
3571
|
+
description='Custom endpoint URL for MinIO/R2/B2/Garage and other S3-compatible providers',
|
|
3572
|
+
)
|
|
3573
|
+
request_timeout_seconds: float | None = Field(
|
|
3574
|
+
30,
|
|
3575
|
+
description='Network timeout in seconds for storage list/read operations',
|
|
3576
|
+
ge=1.0,
|
|
3577
|
+
le=300.0,
|
|
3578
|
+
)
|
|
3579
|
+
max_keys_per_page: int | None = Field(
|
|
3580
|
+
1000,
|
|
3581
|
+
description='Maximum objects requested per provider list API call during table discovery',
|
|
3582
|
+
ge=1,
|
|
3583
|
+
le=1000,
|
|
3584
|
+
)
|
|
3585
|
+
verify_ssl: bool | None = Field(
|
|
3586
|
+
True, description='TLS certificate verification toggle'
|
|
3587
|
+
)
|
|
3588
|
+
|
|
3589
|
+
|
|
3590
|
+
class IcebergOptionalScope(BaseModel):
|
|
3591
|
+
"""
|
|
3592
|
+
Apache Iceberg table selection scope within the bucket.
|
|
3593
|
+
"""
|
|
3594
|
+
|
|
3595
|
+
model_config = ConfigDict(
|
|
3596
|
+
extra='forbid',
|
|
3597
|
+
)
|
|
3598
|
+
prefix: str | None = Field(
|
|
3599
|
+
None,
|
|
3600
|
+
description='Key prefix to search for Apache Iceberg tables (e.g. warehouse/). Tables are auto-discovered by their metadata/ directory.',
|
|
3601
|
+
)
|
|
3602
|
+
table_paths: list[str] | None = Field(
|
|
3603
|
+
None,
|
|
3604
|
+
description='Explicit Apache Iceberg table root keys or s3:// URIs. When set, auto-discovery under prefix is skipped.',
|
|
3605
|
+
)
|
|
3606
|
+
table_limit: int | None = Field(
|
|
3607
|
+
None, description='Optional cap on number of table assets', ge=1
|
|
3608
|
+
)
|
|
3609
|
+
|
|
3610
|
+
|
|
3884
3611
|
class YouTubeInput(CoreInput):
|
|
3885
3612
|
type: Literal['YOUTUBE'] | None = Field(
|
|
3886
3613
|
None, description='Type of the asset or source'
|
|
@@ -3899,6 +3626,58 @@ class YouTubeInput(CoreInput):
|
|
|
3899
3626
|
resources: ResourceOverrides | None = None
|
|
3900
3627
|
|
|
3901
3628
|
|
|
3629
|
+
class DeltaLakeOptional(BaseModel):
|
|
3630
|
+
model_config = ConfigDict(
|
|
3631
|
+
extra='forbid',
|
|
3632
|
+
)
|
|
3633
|
+
connection: LakehouseStorageConnection | None = None
|
|
3634
|
+
scope: DeltaLakeOptionalScope | None = None
|
|
3635
|
+
|
|
3636
|
+
|
|
3637
|
+
class DeltaLakeInput(CoreInput):
|
|
3638
|
+
type: Literal['DELTA_LAKE'] | None = Field(
|
|
3639
|
+
None, description='Type of the asset or source'
|
|
3640
|
+
)
|
|
3641
|
+
required: DeltaLakeRequired
|
|
3642
|
+
masked: S3CompatibleStorageMasked | None = None
|
|
3643
|
+
optional: DeltaLakeOptional | None = None
|
|
3644
|
+
detectors: list[Detector] | None = Field(
|
|
3645
|
+
None, description='Detectors to run on ingested content'
|
|
3646
|
+
)
|
|
3647
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3648
|
+
None,
|
|
3649
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3650
|
+
)
|
|
3651
|
+
sampling: SamplingConfig
|
|
3652
|
+
resources: ResourceOverrides | None = None
|
|
3653
|
+
|
|
3654
|
+
|
|
3655
|
+
class IcebergOptional(BaseModel):
|
|
3656
|
+
model_config = ConfigDict(
|
|
3657
|
+
extra='forbid',
|
|
3658
|
+
)
|
|
3659
|
+
connection: LakehouseStorageConnection | None = None
|
|
3660
|
+
scope: IcebergOptionalScope | None = None
|
|
3661
|
+
|
|
3662
|
+
|
|
3663
|
+
class IcebergInput(CoreInput):
|
|
3664
|
+
type: Literal['ICEBERG'] | None = Field(
|
|
3665
|
+
None, description='Type of the asset or source'
|
|
3666
|
+
)
|
|
3667
|
+
required: IcebergRequired
|
|
3668
|
+
masked: S3CompatibleStorageMasked | None = None
|
|
3669
|
+
optional: IcebergOptional | None = None
|
|
3670
|
+
detectors: list[Detector] | None = Field(
|
|
3671
|
+
None, description='Detectors to run on ingested content'
|
|
3672
|
+
)
|
|
3673
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3674
|
+
None,
|
|
3675
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3676
|
+
)
|
|
3677
|
+
sampling: SamplingConfig
|
|
3678
|
+
resources: ResourceOverrides | None = None
|
|
3679
|
+
|
|
3680
|
+
|
|
3902
3681
|
class SourceInput(
|
|
3903
3682
|
RootModel[
|
|
3904
3683
|
SlackInput
|
|
@@ -3926,8 +3705,6 @@ class SourceInput(
|
|
|
3926
3705
|
| YouTubeInput
|
|
3927
3706
|
| DeltaLakeInput
|
|
3928
3707
|
| IcebergInput
|
|
3929
|
-
| HudiInput
|
|
3930
|
-
| SparkCatalogInput
|
|
3931
3708
|
| KafkaInput
|
|
3932
3709
|
| ElasticsearchInput
|
|
3933
3710
|
| OpenSearchInput
|
|
@@ -3960,8 +3737,6 @@ class SourceInput(
|
|
|
3960
3737
|
| YouTubeInput
|
|
3961
3738
|
| DeltaLakeInput
|
|
3962
3739
|
| IcebergInput
|
|
3963
|
-
| HudiInput
|
|
3964
|
-
| SparkCatalogInput
|
|
3965
3740
|
| KafkaInput
|
|
3966
3741
|
| ElasticsearchInput
|
|
3967
3742
|
| OpenSearchInput
|