classifyre-cli 0.4.33__tar.gz → 0.4.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/.gitignore +3 -0
- classifyre_cli-0.4.34/.turbo/turbo-build.log +3 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/PKG-INFO +1 -1
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/package.json +1 -1
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/pyproject.toml +28 -1
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/models/generated_input.py +493 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/outputs/rest.py +1 -3
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/base.py +1 -3
- classifyre_cli-0.4.34/src/sources/delta_lake/__init__.py +3 -0
- classifyre_cli-0.4.34/src/sources/delta_lake/source.py +139 -0
- classifyre_cli-0.4.34/src/sources/hudi/__init__.py +3 -0
- classifyre_cli-0.4.34/src/sources/hudi/source.py +98 -0
- classifyre_cli-0.4.34/src/sources/iceberg/__init__.py +3 -0
- classifyre_cli-0.4.34/src/sources/iceberg/source.py +148 -0
- classifyre_cli-0.4.34/src/sources/kafka/__init__.py +3 -0
- classifyre_cli-0.4.34/src/sources/kafka/source.py +343 -0
- classifyre_cli-0.4.34/src/sources/spark_base.py +413 -0
- classifyre_cli-0.4.34/src/sources/spark_catalog/__init__.py +3 -0
- classifyre_cli-0.4.34/src/sources/spark_catalog/source.py +85 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/tabular_base.py +4 -1
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/dependency_groups.py +5 -0
- classifyre_cli-0.4.34/src/utils/spark_runtime.py +56 -0
- classifyre_cli-0.4.34/tests/_spark_fakes.py +125 -0
- classifyre_cli-0.4.34/tests/test_delta_lake_source.py +96 -0
- classifyre_cli-0.4.34/tests/test_hudi_source.py +72 -0
- classifyre_cli-0.4.34/tests/test_iceberg_source.py +95 -0
- classifyre_cli-0.4.34/tests/test_kafka_source.py +141 -0
- classifyre_cli-0.4.34/tests/test_spark_catalog_source.py +71 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/uv.lock +605 -568
- classifyre_cli-0.4.33/.turbo/turbo-build.log +0 -3
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/.python-version +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/README.md +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/main.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/config.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/main.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/pipeline/detector_pipeline.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/asset_metadata.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/databricks/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/email/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/email/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/notion/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/notion/client.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/notion/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/object_storage/base.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/youtube/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/sources/youtube/source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/file_metadata.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/resources.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/transcription.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_assets_metadata_catalog.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_config.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_dependency_groups.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_email_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_notion_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_sampling_automatic.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_tabular_automatic_sampling.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_uv_sync.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_youtube_source.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/test_youtube_source_integration.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/utils/test_file_metadata.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/utils/test_file_parser.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/utils/test_file_to_images.py +0 -0
- {classifyre_cli-0.4.33 → classifyre_cli-0.4.34}/tests/utils/test_transcription.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "classifyre-cli"
|
|
3
|
-
version = "0.4.33"
|
|
3
|
+
version = "0.4.34"
|
|
4
4
|
description = "Classifyre CLI — scan and classify unstructured data sources"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -202,6 +202,33 @@ youtube = [
|
|
|
202
202
|
"yt-dlp>=2025.1.0",
|
|
203
203
|
"youtube-transcript-api>=1.0.0",
|
|
204
204
|
]
|
|
205
|
+
spark = [
|
|
206
|
+
# Shared PySpark runtime for the lakehouse sources. Requires a JDK (Java 21
|
|
207
|
+
# LTS) on the host; format JARs (Delta/Iceberg/Hudi) are resolved at runtime
|
|
208
|
+
# via spark.jars.packages (see SPARK_* env vars). Pinned to the Spark 4.1
|
|
209
|
+
# minor so the format-JAR coordinates below stay version-matched.
|
|
210
|
+
"pyspark>=4.1,<4.2",
|
|
211
|
+
]
|
|
212
|
+
delta-lake = [
|
|
213
|
+
{ include-group = "spark" },
|
|
214
|
+
"delta-spark>=4.0",
|
|
215
|
+
]
|
|
216
|
+
hudi = [
|
|
217
|
+
# Hudi Spark integration ships as Maven JARs configured via
|
|
218
|
+
# SPARK_JARS_PACKAGES; only the shared PySpark runtime is needed here.
|
|
219
|
+
{ include-group = "spark" },
|
|
220
|
+
]
|
|
221
|
+
spark-catalog = [
|
|
222
|
+
{ include-group = "spark" },
|
|
223
|
+
]
|
|
224
|
+
iceberg = [
|
|
225
|
+
# Inspected through Spark (iceberg-spark-runtime JARs via SPARK_JARS_PACKAGES),
|
|
226
|
+
# sharing the JVM the other lakehouse sources already require.
|
|
227
|
+
{ include-group = "spark" },
|
|
228
|
+
]
|
|
229
|
+
kafka = [
|
|
230
|
+
"kafka-python>=2.0",
|
|
231
|
+
]
|
|
205
232
|
otel = [
|
|
206
233
|
"opentelemetry-sdk>=1.42.0",
|
|
207
234
|
"opentelemetry-exporter-otlp-proto-http>=1.27.0",
|
|
@@ -45,6 +45,11 @@ class AssetType(StrEnum):
|
|
|
45
45
|
NOTION = 'NOTION'
|
|
46
46
|
EMAIL = 'EMAIL'
|
|
47
47
|
YOUTUBE = 'YOUTUBE'
|
|
48
|
+
DELTA_LAKE = 'DELTA_LAKE'
|
|
49
|
+
ICEBERG = 'ICEBERG'
|
|
50
|
+
HUDI = 'HUDI'
|
|
51
|
+
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
52
|
+
KAFKA = 'KAFKA'
|
|
48
53
|
|
|
49
54
|
|
|
50
55
|
class DetectorType(StrEnum):
|
|
@@ -343,6 +348,11 @@ class Type(StrEnum):
|
|
|
343
348
|
NOTION = 'NOTION'
|
|
344
349
|
EMAIL = 'EMAIL'
|
|
345
350
|
YOUTUBE = 'YOUTUBE'
|
|
351
|
+
DELTA_LAKE = 'DELTA_LAKE'
|
|
352
|
+
ICEBERG = 'ICEBERG'
|
|
353
|
+
HUDI = 'HUDI'
|
|
354
|
+
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
355
|
+
KAFKA = 'KAFKA'
|
|
346
356
|
|
|
347
357
|
|
|
348
358
|
class YouTubeRequired(BaseModel):
|
|
@@ -2850,6 +2860,11 @@ class Type19(StrEnum):
|
|
|
2850
2860
|
NOTION = 'NOTION'
|
|
2851
2861
|
EMAIL = 'EMAIL'
|
|
2852
2862
|
YOUTUBE = 'YOUTUBE'
|
|
2863
|
+
DELTA_LAKE = 'DELTA_LAKE'
|
|
2864
|
+
ICEBERG = 'ICEBERG'
|
|
2865
|
+
HUDI = 'HUDI'
|
|
2866
|
+
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
2867
|
+
KAFKA = 'KAFKA'
|
|
2853
2868
|
|
|
2854
2869
|
|
|
2855
2870
|
class ConfluenceInput(CoreInput):
|
|
@@ -3093,6 +3108,474 @@ class NotionInput(CoreInput):
|
|
|
3093
3108
|
resources: ResourceOverrides | None = None
|
|
3094
3109
|
|
|
3095
3110
|
|
|
3111
|
+
class IcebergCatalogType(StrEnum):
|
|
3112
|
+
"""
|
|
3113
|
+
PyIceberg catalog backend type
|
|
3114
|
+
"""
|
|
3115
|
+
|
|
3116
|
+
REST = 'REST'
|
|
3117
|
+
HIVE = 'HIVE'
|
|
3118
|
+
GLUE = 'GLUE'
|
|
3119
|
+
SQL = 'SQL'
|
|
3120
|
+
|
|
3121
|
+
|
|
3122
|
+
class KafkaSecurityProtocol(StrEnum):
|
|
3123
|
+
"""
|
|
3124
|
+
Kafka client security protocol
|
|
3125
|
+
"""
|
|
3126
|
+
|
|
3127
|
+
PLAINTEXT = 'PLAINTEXT'
|
|
3128
|
+
SSL = 'SSL'
|
|
3129
|
+
SASL_PLAINTEXT = 'SASL_PLAINTEXT'
|
|
3130
|
+
SASL_SSL = 'SASL_SSL'
|
|
3131
|
+
|
|
3132
|
+
|
|
3133
|
+
class KafkaSaslMechanism(StrEnum):
|
|
3134
|
+
"""
|
|
3135
|
+
SASL mechanism used when security_protocol is SASL_*
|
|
3136
|
+
"""
|
|
3137
|
+
|
|
3138
|
+
PLAIN = 'PLAIN'
|
|
3139
|
+
SCRAM_SHA_256 = 'SCRAM-SHA-256'
|
|
3140
|
+
SCRAM_SHA_512 = 'SCRAM-SHA-512'
|
|
3141
|
+
|
|
3142
|
+
|
|
3143
|
+
class DeltaLakeRequired(BaseModel):
|
|
3144
|
+
model_config = ConfigDict(
|
|
3145
|
+
extra='forbid',
|
|
3146
|
+
)
|
|
3147
|
+
warehouse_path: str = Field(
|
|
3148
|
+
...,
|
|
3149
|
+
description='Root storage location holding Delta tables (e.g. s3a://lake/warehouse, file:///data)',
|
|
3150
|
+
)
|
|
3151
|
+
|
|
3152
|
+
|
|
3153
|
+
class DeltaLakeMasked(BaseModel):
|
|
3154
|
+
"""
|
|
3155
|
+
Optional object-store credentials for the warehouse location.
|
|
3156
|
+
"""
|
|
3157
|
+
|
|
3158
|
+
model_config = ConfigDict(
|
|
3159
|
+
extra='forbid',
|
|
3160
|
+
)
|
|
3161
|
+
s3_access_key_id: str | None = Field(
|
|
3162
|
+
None, description='S3 access key id for object-store warehouses'
|
|
3163
|
+
)
|
|
3164
|
+
s3_secret_access_key: str | None = Field(None, description='S3 secret access key')
|
|
3165
|
+
s3_session_token: str | None = Field(None, description='Optional S3 session token')
|
|
3166
|
+
|
|
3167
|
+
|
|
3168
|
+
class DeltaLakeOptionalConnection(BaseModel):
|
|
3169
|
+
"""
|
|
3170
|
+
Delta Lake connection and storage options.
|
|
3171
|
+
"""
|
|
3172
|
+
|
|
3173
|
+
model_config = ConfigDict(
|
|
3174
|
+
extra='forbid',
|
|
3175
|
+
)
|
|
3176
|
+
metastore_uri: str | None = Field(
|
|
3177
|
+
None,
|
|
3178
|
+
description='Hive Metastore thrift URI; enables catalog-based table discovery',
|
|
3179
|
+
)
|
|
3180
|
+
endpoint_url: str | None = Field(
|
|
3181
|
+
None, description='Custom S3-compatible endpoint URL'
|
|
3182
|
+
)
|
|
3183
|
+
region: str | None = Field(None, description='Object-store region')
|
|
3184
|
+
|
|
3185
|
+
|
|
3186
|
+
class DeltaLakeOptionalScope(BaseModel):
|
|
3187
|
+
"""
|
|
3188
|
+
Delta Lake database and table selection scope.
|
|
3189
|
+
"""
|
|
3190
|
+
|
|
3191
|
+
model_config = ConfigDict(
|
|
3192
|
+
extra='forbid',
|
|
3193
|
+
)
|
|
3194
|
+
database: str | None = Field(
|
|
3195
|
+
None, description='Single database/namespace to scan (catalog mode)'
|
|
3196
|
+
)
|
|
3197
|
+
include_all_databases: bool | None = Field(
|
|
3198
|
+
False, description='Scan all visible databases except excluded system databases'
|
|
3199
|
+
)
|
|
3200
|
+
exclude_databases: list[str] | None = Field(
|
|
3201
|
+
['information_schema', 'sys'], description='Database denylist (exact names)'
|
|
3202
|
+
)
|
|
3203
|
+
include_tables: list[str] | None = Field(
|
|
3204
|
+
None,
|
|
3205
|
+
description='Optional table allowlist. Accepted forms: table or database.table',
|
|
3206
|
+
)
|
|
3207
|
+
table_limit: int | None = Field(
|
|
3208
|
+
None, description='Optional cap on number of table assets per database', ge=1
|
|
3209
|
+
)
|
|
3210
|
+
table_paths: list[str] | None = Field(
|
|
3211
|
+
None,
|
|
3212
|
+
description='Explicit Delta table locations to scan when no metastore is configured',
|
|
3213
|
+
)
|
|
3214
|
+
|
|
3215
|
+
|
|
3216
|
+
class DeltaLakeOptional(BaseModel):
|
|
3217
|
+
model_config = ConfigDict(
|
|
3218
|
+
extra='forbid',
|
|
3219
|
+
)
|
|
3220
|
+
connection: DeltaLakeOptionalConnection | None = None
|
|
3221
|
+
scope: DeltaLakeOptionalScope | None = None
|
|
3222
|
+
|
|
3223
|
+
|
|
3224
|
+
class DeltaLakeInput(CoreInput):
|
|
3225
|
+
type: Literal['DELTA_LAKE'] | None = Field(
|
|
3226
|
+
None, description='Type of the asset or source'
|
|
3227
|
+
)
|
|
3228
|
+
required: DeltaLakeRequired
|
|
3229
|
+
masked: DeltaLakeMasked | None = None
|
|
3230
|
+
optional: DeltaLakeOptional | None = None
|
|
3231
|
+
detectors: list[Detector] | None = Field(
|
|
3232
|
+
None, description='Detectors to run on ingested content'
|
|
3233
|
+
)
|
|
3234
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3235
|
+
None,
|
|
3236
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3237
|
+
)
|
|
3238
|
+
sampling: SamplingConfig
|
|
3239
|
+
resources: ResourceOverrides | None = None
|
|
3240
|
+
|
|
3241
|
+
|
|
3242
|
+
class HudiRequired(BaseModel):
|
|
3243
|
+
model_config = ConfigDict(
|
|
3244
|
+
extra='forbid',
|
|
3245
|
+
)
|
|
3246
|
+
warehouse_path: str = Field(
|
|
3247
|
+
..., description='Root storage location holding Hudi tables'
|
|
3248
|
+
)
|
|
3249
|
+
|
|
3250
|
+
|
|
3251
|
+
class HudiMasked(BaseModel):
|
|
3252
|
+
"""
|
|
3253
|
+
Optional object-store credentials for the warehouse location.
|
|
3254
|
+
"""
|
|
3255
|
+
|
|
3256
|
+
model_config = ConfigDict(
|
|
3257
|
+
extra='forbid',
|
|
3258
|
+
)
|
|
3259
|
+
s3_access_key_id: str | None = Field(
|
|
3260
|
+
None, description='S3 access key id for object-store warehouses'
|
|
3261
|
+
)
|
|
3262
|
+
s3_secret_access_key: str | None = Field(None, description='S3 secret access key')
|
|
3263
|
+
s3_session_token: str | None = Field(None, description='Optional S3 session token')
|
|
3264
|
+
|
|
3265
|
+
|
|
3266
|
+
class HudiOptionalConnection(BaseModel):
|
|
3267
|
+
"""
|
|
3268
|
+
Hudi connection and storage options.
|
|
3269
|
+
"""
|
|
3270
|
+
|
|
3271
|
+
model_config = ConfigDict(
|
|
3272
|
+
extra='forbid',
|
|
3273
|
+
)
|
|
3274
|
+
metastore_uri: str | None = Field(
|
|
3275
|
+
None,
|
|
3276
|
+
description='Hive Metastore thrift URI; enables catalog-based table discovery',
|
|
3277
|
+
)
|
|
3278
|
+
endpoint_url: str | None = Field(
|
|
3279
|
+
None, description='Custom S3-compatible endpoint URL'
|
|
3280
|
+
)
|
|
3281
|
+
region: str | None = Field(None, description='Object-store region')
|
|
3282
|
+
|
|
3283
|
+
|
|
3284
|
+
class HudiOptionalScope(BaseModel):
|
|
3285
|
+
"""
|
|
3286
|
+
Hudi database and table selection scope.
|
|
3287
|
+
"""
|
|
3288
|
+
|
|
3289
|
+
model_config = ConfigDict(
|
|
3290
|
+
extra='forbid',
|
|
3291
|
+
)
|
|
3292
|
+
database: str | None = Field(
|
|
3293
|
+
None, description='Single database/namespace to scan (catalog mode)'
|
|
3294
|
+
)
|
|
3295
|
+
include_all_databases: bool | None = Field(
|
|
3296
|
+
False, description='Scan all visible databases except excluded system databases'
|
|
3297
|
+
)
|
|
3298
|
+
exclude_databases: list[str] | None = Field(
|
|
3299
|
+
['information_schema', 'sys'], description='Database denylist (exact names)'
|
|
3300
|
+
)
|
|
3301
|
+
include_tables: list[str] | None = Field(
|
|
3302
|
+
None,
|
|
3303
|
+
description='Optional table allowlist. Accepted forms: table or database.table',
|
|
3304
|
+
)
|
|
3305
|
+
table_limit: int | None = Field(
|
|
3306
|
+
None, description='Optional cap on number of table assets per database', ge=1
|
|
3307
|
+
)
|
|
3308
|
+
table_paths: list[str] | None = Field(
|
|
3309
|
+
None,
|
|
3310
|
+
description='Explicit Hudi table locations to scan when no metastore is configured',
|
|
3311
|
+
)
|
|
3312
|
+
|
|
3313
|
+
|
|
3314
|
+
class HudiOptional(BaseModel):
|
|
3315
|
+
model_config = ConfigDict(
|
|
3316
|
+
extra='forbid',
|
|
3317
|
+
)
|
|
3318
|
+
connection: HudiOptionalConnection | None = None
|
|
3319
|
+
scope: HudiOptionalScope | None = None
|
|
3320
|
+
|
|
3321
|
+
|
|
3322
|
+
class HudiInput(CoreInput):
|
|
3323
|
+
type: Literal['HUDI'] | None = Field(
|
|
3324
|
+
None, description='Type of the asset or source'
|
|
3325
|
+
)
|
|
3326
|
+
required: HudiRequired
|
|
3327
|
+
masked: HudiMasked | None = None
|
|
3328
|
+
optional: HudiOptional | None = None
|
|
3329
|
+
detectors: list[Detector] | None = Field(
|
|
3330
|
+
None, description='Detectors to run on ingested content'
|
|
3331
|
+
)
|
|
3332
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3333
|
+
None,
|
|
3334
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3335
|
+
)
|
|
3336
|
+
sampling: SamplingConfig
|
|
3337
|
+
resources: ResourceOverrides | None = None
|
|
3338
|
+
|
|
3339
|
+
|
|
3340
|
+
class SparkCatalogRequired(BaseModel):
|
|
3341
|
+
model_config = ConfigDict(
|
|
3342
|
+
extra='forbid',
|
|
3343
|
+
)
|
|
3344
|
+
connect_url: str = Field(
|
|
3345
|
+
...,
|
|
3346
|
+
description='Spark Connect endpoint (sc://host:15002) or classic master (spark://host:7077)',
|
|
3347
|
+
)
|
|
3348
|
+
|
|
3349
|
+
|
|
3350
|
+
class SparkCatalogMasked(BaseModel):
    """
    Optional Spark Connect authentication.
    """

    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    token: str | None = Field(
        default=None,
        description='Bearer token for Spark Connect authentication',
    )
|
|
3361
|
+
|
|
3362
|
+
|
|
3363
|
+
class SparkCatalogOptionalScope(BaseModel):
    """
    Spark catalog and table selection scope.
    """

    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    # Catalog / database selection.
    catalog: str | None = Field(
        default=None,
        description='Spark catalog name to scan (defaults to the session catalog)',
    )
    database: str | None = Field(
        default=None,
        description='Single database/namespace to scan',
    )
    include_all_databases: bool | None = Field(
        default=False,
        description='Scan all visible databases except excluded system databases',
    )
    exclude_databases: list[str] | None = Field(
        default=['information_schema', 'sys'],
        description='Database denylist (exact names)',
    )

    # Table selection; table_limit is constrained to be >= 1 when given.
    include_tables: list[str] | None = Field(
        default=None,
        description='Optional table allowlist. Accepted forms: table or database.table',
    )
    table_limit: int | None = Field(
        default=None,
        description='Optional cap on number of table assets per database',
        ge=1,
    )
|
|
3388
|
+
|
|
3389
|
+
|
|
3390
|
+
# Optional Spark catalog settings; currently only a scope sub-model,
# which defaults to None when omitted.
class SparkCatalogOptional(BaseModel):
    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    scope: SparkCatalogOptionalScope | None = None
|
|
3395
|
+
|
|
3396
|
+
|
|
3397
|
+
# Source configuration whose `type` discriminator is the literal 'SPARK_CATALOG'.
# `required` and `sampling` must be supplied; every other field defaults to None.
class SparkCatalogInput(CoreInput):
    type: Literal['SPARK_CATALOG'] | None = Field(
        default=None,
        description='Type of the asset or source',
    )

    # Settings groups: mandatory, secret, and optional.
    required: SparkCatalogRequired
    masked: SparkCatalogMasked | None = None
    optional: SparkCatalogOptional | None = None

    # Detector selection.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )

    # Sampling is mandatory; resource overrides are optional.
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
|
|
3413
|
+
|
|
3414
|
+
|
|
3415
|
+
# Mandatory settings for an Iceberg source: catalog type and warehouse root.
# catalog_uri is declared here as well but may be left as None.
class IcebergRequired(BaseModel):
    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    catalog_type: IcebergCatalogType
    catalog_uri: str | None = Field(
        default=None,
        description='Catalog URI (REST endpoint, Hive metastore thrift URI, or SQL DSN). Not required for GLUE.',
    )
    # Required: an Ellipsis default marks the field as having no default value.
    warehouse: str = Field(
        default=...,
        description='Warehouse location root (e.g. s3://bucket/warehouse)',
    )
|
|
3427
|
+
|
|
3428
|
+
|
|
3429
|
+
class IcebergMasked(BaseModel):
    """
    Optional Iceberg catalog/storage credentials.
    """

    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    token: str | None = Field(
        default=None,
        description='Bearer token for a REST catalog',
    )
    aws_access_key_id: str | None = Field(
        default=None,
        description='AWS access key id (Glue/S3)',
    )
    aws_secret_access_key: str | None = Field(
        default=None,
        description='AWS secret access key (Glue/S3)',
    )
|
|
3444
|
+
|
|
3445
|
+
|
|
3446
|
+
class IcebergOptionalScope(BaseModel):
    """
    Iceberg namespace and table selection scope.
    """

    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    # Namespace selection.
    namespace: str | None = Field(
        default=None,
        description='Single namespace to scan (dotted form supported)',
    )
    include_all_namespaces: bool | None = Field(
        default=False,
        description='Scan all visible namespaces',
    )

    # Table selection; table_limit is constrained to be >= 1 when given.
    include_tables: list[str] | None = Field(
        default=None,
        description='Optional table allowlist. Accepted forms: table or namespace.table',
    )
    table_limit: int | None = Field(
        default=None,
        description='Optional cap on number of table assets per namespace',
        ge=1,
    )
|
|
3467
|
+
|
|
3468
|
+
|
|
3469
|
+
# Optional Iceberg settings; currently only a scope sub-model,
# which defaults to None when omitted.
class IcebergOptional(BaseModel):
    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    scope: IcebergOptionalScope | None = None
|
|
3474
|
+
|
|
3475
|
+
|
|
3476
|
+
# Source configuration whose `type` discriminator is the literal 'ICEBERG'.
# `required` and `sampling` must be supplied; every other field defaults to None.
class IcebergInput(CoreInput):
    type: Literal['ICEBERG'] | None = Field(
        default=None,
        description='Type of the asset or source',
    )

    # Settings groups: mandatory, secret, and optional.
    required: IcebergRequired
    masked: IcebergMasked | None = None
    optional: IcebergOptional | None = None

    # Detector selection.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )

    # Sampling is mandatory; resource overrides are optional.
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
|
|
3492
|
+
|
|
3493
|
+
|
|
3494
|
+
# Mandatory settings for a Kafka source: only the bootstrap server list.
class KafkaRequired(BaseModel):
    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    # Required: an Ellipsis default marks the field as having no default value.
    bootstrap_servers: str = Field(
        default=...,
        description='Comma-separated Kafka bootstrap servers (host:port)',
    )
|
|
3501
|
+
|
|
3502
|
+
|
|
3503
|
+
class KafkaMasked(BaseModel):
    """
    Optional SASL credentials.
    """

    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    sasl_username: str | None = Field(
        default=None,
        description='SASL username',
    )
    sasl_password: str | None = Field(
        default=None,
        description='SASL password',
    )
|
|
3513
|
+
|
|
3514
|
+
|
|
3515
|
+
class KafkaOptionalConnection(BaseModel):
    """
    Kafka client connection and security options.
    """

    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    # Security settings; the defaults are the plain strings shown here.
    security_protocol: KafkaSecurityProtocol | None = 'PLAINTEXT'
    sasl_mechanism: KafkaSaslMechanism | None = 'PLAIN'
    ssl_ca: str | None = Field(
        default=None,
        description='PEM-encoded CA certificate for TLS verification',
    )

    # Defaults to 30000 ms and is constrained to be >= 1000 ms.
    request_timeout_ms: int | None = Field(
        default=30000,
        description='Client request timeout in milliseconds',
        ge=1000,
    )
|
|
3531
|
+
|
|
3532
|
+
|
|
3533
|
+
class KafkaOptionalScope(BaseModel):
    """
    Kafka topic selection scope.
    """

    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    # Allow / deny lists of topic names.
    include_topics: list[str] | None = Field(
        default=None,
        description='Optional topic allowlist',
    )
    exclude_topics: list[str] | None = Field(
        default=None,
        description='Topic denylist',
    )
    include_internal: bool | None = Field(
        default=False,
        description='Include internal topics (names starting with __)',
    )

    # topic_limit is constrained to be >= 1 when given.
    topic_limit: int | None = Field(
        default=None,
        description='Optional cap on number of topic assets',
        ge=1,
    )
|
|
3551
|
+
|
|
3552
|
+
|
|
3553
|
+
# Optional Kafka settings, grouped into connection and scope sub-models.
# Both groups default to None when omitted.
class KafkaOptional(BaseModel):
    # Keys that are not declared below are rejected at validation time.
    model_config = ConfigDict(extra='forbid')

    connection: KafkaOptionalConnection | None = None
    scope: KafkaOptionalScope | None = None
|
|
3559
|
+
|
|
3560
|
+
|
|
3561
|
+
# Source configuration whose `type` discriminator is the literal 'KAFKA'.
# `required` and `sampling` must be supplied; every other field defaults to None.
class KafkaInput(CoreInput):
    type: Literal['KAFKA'] | None = Field(
        default=None,
        description='Type of the asset or source',
    )

    # Settings groups: mandatory, secret, and optional.
    required: KafkaRequired
    masked: KafkaMasked | None = None
    optional: KafkaOptional | None = None

    # Detector selection.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )

    # Sampling is mandatory; resource overrides are optional.
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
|
|
3577
|
+
|
|
3578
|
+
|
|
3096
3579
|
class YouTubeInput(CoreInput):
|
|
3097
3580
|
type: Literal['YOUTUBE'] | None = Field(
|
|
3098
3581
|
None, description='Type of the asset or source'
|
|
@@ -3136,6 +3619,11 @@ class SourceInput(
|
|
|
3136
3619
|
| NotionInput
|
|
3137
3620
|
| EmailInput
|
|
3138
3621
|
| YouTubeInput
|
|
3622
|
+
| DeltaLakeInput
|
|
3623
|
+
| IcebergInput
|
|
3624
|
+
| HudiInput
|
|
3625
|
+
| SparkCatalogInput
|
|
3626
|
+
| KafkaInput
|
|
3139
3627
|
]
|
|
3140
3628
|
):
|
|
3141
3629
|
root: (
|
|
@@ -3162,6 +3650,11 @@ class SourceInput(
|
|
|
3162
3650
|
| NotionInput
|
|
3163
3651
|
| EmailInput
|
|
3164
3652
|
| YouTubeInput
|
|
3653
|
+
| DeltaLakeInput
|
|
3654
|
+
| IcebergInput
|
|
3655
|
+
| HudiInput
|
|
3656
|
+
| SparkCatalogInput
|
|
3657
|
+
| KafkaInput
|
|
3165
3658
|
) = Field(
|
|
3166
3659
|
...,
|
|
3167
3660
|
description='Merged configuration schema with all source types and common definitions',
|
|
@@ -129,9 +129,7 @@ class FinalizeIngestRunRequest(BaseModel):
|
|
|
129
129
|
seen_hashes: list[str] = Field(serialization_alias="seenHashes")
|
|
130
130
|
# AUTOMATIC sampling cursor to persist on the source for the next run.
|
|
131
131
|
# Omitted (None) for other strategies so the stored cursor is left untouched.
|
|
132
|
-
sampling_cursor: dict[str, Any] | None = Field(
|
|
133
|
-
None, serialization_alias="samplingCursor"
|
|
134
|
-
)
|
|
132
|
+
sampling_cursor: dict[str, Any] | None = Field(None, serialization_alias="samplingCursor")
|
|
135
133
|
|
|
136
134
|
|
|
137
135
|
class UpdateRunnerStatusRequest(BaseModel):
|
|
@@ -133,9 +133,7 @@ class BaseSource(ABC):
|
|
|
133
133
|
saved = self._sampling_cursor.get(key)
|
|
134
134
|
return saved if isinstance(saved, int) and saved >= 0 else 0
|
|
135
135
|
|
|
136
|
-
def record_automatic_offset(
|
|
137
|
-
self, key: str, *, prev_offset: int, fetched: int
|
|
138
|
-
) -> None:
|
|
136
|
+
def record_automatic_offset(self, key: str, *, prev_offset: int, fetched: int) -> None:
|
|
139
137
|
"""Advance a keyed offset cursor; wrap to 0 once a page underfills.
|
|
140
138
|
|
|
141
139
|
Used by sources that page rows directly from the backing store
|