classifyre-cli 0.4.34__tar.gz → 0.4.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/PKG-INFO +1 -1
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/package.json +1 -1
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/pyproject.toml +7 -2
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/models/generated_input.py +319 -8
- classifyre_cli-0.4.35/src/sources/elasticsearch/__init__.py +3 -0
- classifyre_cli-0.4.35/src/sources/elasticsearch/source.py +31 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/kafka/source.py +27 -8
- classifyre_cli-0.4.35/src/sources/meilisearch/__init__.py +3 -0
- classifyre_cli-0.4.35/src/sources/meilisearch/source.py +353 -0
- classifyre_cli-0.4.35/src/sources/opensearch/__init__.py +3 -0
- classifyre_cli-0.4.35/src/sources/opensearch/source.py +32 -0
- classifyre_cli-0.4.35/src/sources/search_engine_base.py +345 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/spark_catalog/source.py +13 -5
- classifyre_cli-0.4.35/tests/test_elasticsearch_source.py +238 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_kafka_source.py +52 -1
- classifyre_cli-0.4.35/tests/test_meilisearch_source.py +255 -0
- classifyre_cli-0.4.35/tests/test_opensearch_source.py +228 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/uv.lock +282 -157
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/.gitignore +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/.python-version +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/README.md +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/main.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/config.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/main.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/pipeline/detector_pipeline.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/asset_metadata.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/databricks/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/delta_lake/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/delta_lake/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/email/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/email/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/hudi/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/hudi/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/iceberg/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/iceberg/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/kafka/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/notion/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/notion/client.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/notion/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/object_storage/base.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/spark_base.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/spark_catalog/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/tabular_base.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/youtube/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/sources/youtube/source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/dependency_groups.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/file_metadata.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/resources.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/spark_runtime.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/transcription.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/_spark_fakes.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_assets_metadata_catalog.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_config.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_delta_lake_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_dependency_groups.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_email_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_hudi_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_iceberg_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_notion_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_sampling_automatic.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_spark_catalog_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_tabular_automatic_sampling.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_uv_sync.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_youtube_source.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/test_youtube_source_integration.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/utils/test_file_metadata.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/utils/test_file_parser.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/utils/test_file_to_images.py +0 -0
- {classifyre_cli-0.4.34 → classifyre_cli-0.4.35}/tests/utils/test_transcription.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "classifyre-cli"
|
|
3
|
-
version = "0.4.34"
|
|
3
|
+
version = "0.4.35"
|
|
4
4
|
description = "Classifyre CLI — scan and classify unstructured data sources"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -207,7 +207,12 @@ spark = [
|
|
|
207
207
|
# LTS) on the host; format JARs (Delta/Iceberg/Hudi) are resolved at runtime
|
|
208
208
|
# via spark.jars.packages (see SPARK_* env vars). Pinned to the Spark 4.1
|
|
209
209
|
# minor so the format-JAR coordinates below stay version-matched.
|
|
210
|
-
|
|
210
|
+
#
|
|
211
|
+
# The `connect` extra pulls in pandas>=2.2, pyarrow, and grpcio, which the
|
|
212
|
+
# Spark Connect client (Spark Catalog via sc:// URLs) hard-requires at session
|
|
213
|
+
# build time. Classic/local Spark (Delta/Hudi/Iceberg) does not need them, but
|
|
214
|
+
# they share this group, so we ship the client deps once for all of them.
|
|
215
|
+
"pyspark[connect]>=4.1,<4.2",
|
|
211
216
|
]
|
|
212
217
|
delta-lake = [
|
|
213
218
|
{ include-group = "spark" },
|
|
@@ -50,6 +50,9 @@ class AssetType(StrEnum):
|
|
|
50
50
|
HUDI = 'HUDI'
|
|
51
51
|
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
52
52
|
KAFKA = 'KAFKA'
|
|
53
|
+
ELASTICSEARCH = 'ELASTICSEARCH'
|
|
54
|
+
OPENSEARCH = 'OPENSEARCH'
|
|
55
|
+
MEILISEARCH = 'MEILISEARCH'
|
|
53
56
|
|
|
54
57
|
|
|
55
58
|
class DetectorType(StrEnum):
|
|
@@ -353,6 +356,9 @@ class Type(StrEnum):
|
|
|
353
356
|
HUDI = 'HUDI'
|
|
354
357
|
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
355
358
|
KAFKA = 'KAFKA'
|
|
359
|
+
ELASTICSEARCH = 'ELASTICSEARCH'
|
|
360
|
+
OPENSEARCH = 'OPENSEARCH'
|
|
361
|
+
MEILISEARCH = 'MEILISEARCH'
|
|
356
362
|
|
|
357
363
|
|
|
358
364
|
class YouTubeRequired(BaseModel):
|
|
@@ -2865,6 +2871,9 @@ class Type19(StrEnum):
|
|
|
2865
2871
|
HUDI = 'HUDI'
|
|
2866
2872
|
SPARK_CATALOG = 'SPARK_CATALOG'
|
|
2867
2873
|
KAFKA = 'KAFKA'
|
|
2874
|
+
ELASTICSEARCH = 'ELASTICSEARCH'
|
|
2875
|
+
OPENSEARCH = 'OPENSEARCH'
|
|
2876
|
+
MEILISEARCH = 'MEILISEARCH'
|
|
2868
2877
|
|
|
2869
2878
|
|
|
2870
2879
|
class ConfluenceInput(CoreInput):
|
|
@@ -3491,25 +3500,68 @@ class IcebergInput(CoreInput):
|
|
|
3491
3500
|
resources: ResourceOverrides | None = None
|
|
3492
3501
|
|
|
3493
3502
|
|
|
3494
|
-
class
|
|
3503
|
+
class NoAuthentication(BaseModel):
|
|
3495
3504
|
model_config = ConfigDict(
|
|
3496
3505
|
extra='forbid',
|
|
3497
3506
|
)
|
|
3507
|
+
auth_mode: Literal['NONE']
|
|
3498
3508
|
bootstrap_servers: str = Field(
|
|
3499
3509
|
..., description='Comma-separated Kafka bootstrap servers (host:port)'
|
|
3500
3510
|
)
|
|
3501
3511
|
|
|
3502
3512
|
|
|
3503
|
-
class
|
|
3513
|
+
class SASL(BaseModel):
|
|
3514
|
+
model_config = ConfigDict(
|
|
3515
|
+
extra='forbid',
|
|
3516
|
+
)
|
|
3517
|
+
auth_mode: Literal['SASL']
|
|
3518
|
+
bootstrap_servers: str = Field(
|
|
3519
|
+
..., description='Comma-separated Kafka bootstrap servers (host:port)'
|
|
3520
|
+
)
|
|
3521
|
+
|
|
3522
|
+
|
|
3523
|
+
class ClientCertificateMTLS(BaseModel):
|
|
3524
|
+
model_config = ConfigDict(
|
|
3525
|
+
extra='forbid',
|
|
3526
|
+
)
|
|
3527
|
+
auth_mode: Literal['CLIENT_CERT']
|
|
3528
|
+
bootstrap_servers: str = Field(
|
|
3529
|
+
..., description='Comma-separated Kafka bootstrap servers (host:port)'
|
|
3530
|
+
)
|
|
3531
|
+
|
|
3532
|
+
|
|
3533
|
+
class NoAuthentication1(BaseModel):
|
|
3534
|
+
model_config = ConfigDict(
|
|
3535
|
+
extra='forbid',
|
|
3536
|
+
)
|
|
3537
|
+
|
|
3538
|
+
|
|
3539
|
+
class SASL1(BaseModel):
|
|
3504
3540
|
"""
|
|
3505
|
-
|
|
3541
|
+
SASL username/password credentials.
|
|
3506
3542
|
"""
|
|
3507
3543
|
|
|
3508
3544
|
model_config = ConfigDict(
|
|
3509
3545
|
extra='forbid',
|
|
3510
3546
|
)
|
|
3511
|
-
sasl_username: str
|
|
3512
|
-
sasl_password: str
|
|
3547
|
+
sasl_username: str = Field(..., description='SASL username')
|
|
3548
|
+
sasl_password: str = Field(..., description='SASL password')
|
|
3549
|
+
|
|
3550
|
+
|
|
3551
|
+
class ClientCertificateMTLS1(BaseModel):
|
|
3552
|
+
"""
|
|
3553
|
+
mTLS client certificate credentials.
|
|
3554
|
+
"""
|
|
3555
|
+
|
|
3556
|
+
model_config = ConfigDict(
|
|
3557
|
+
extra='forbid',
|
|
3558
|
+
)
|
|
3559
|
+
ssl_certfile: str = Field(
|
|
3560
|
+
..., description='PEM-encoded client certificate (access cert)'
|
|
3561
|
+
)
|
|
3562
|
+
ssl_keyfile: str = Field(
|
|
3563
|
+
..., description='PEM-encoded client private key (access key)'
|
|
3564
|
+
)
|
|
3513
3565
|
|
|
3514
3566
|
|
|
3515
3567
|
class KafkaOptionalConnection(BaseModel):
|
|
@@ -3523,7 +3575,8 @@ class KafkaOptionalConnection(BaseModel):
|
|
|
3523
3575
|
security_protocol: KafkaSecurityProtocol | None = 'PLAINTEXT'
|
|
3524
3576
|
sasl_mechanism: KafkaSaslMechanism | None = 'PLAIN'
|
|
3525
3577
|
ssl_ca: str | None = Field(
|
|
3526
|
-
None,
|
|
3578
|
+
None,
|
|
3579
|
+
description="PEM-encoded CA certificate for TLS verification (optional for client-certificate auth; validates the broker's certificate)",
|
|
3527
3580
|
)
|
|
3528
3581
|
request_timeout_ms: int | None = Field(
|
|
3529
3582
|
30000, description='Client request timeout in milliseconds', ge=1000
|
|
@@ -3562,8 +3615,12 @@ class KafkaInput(CoreInput):
|
|
|
3562
3615
|
type: Literal['KAFKA'] | None = Field(
|
|
3563
3616
|
None, description='Type of the asset or source'
|
|
3564
3617
|
)
|
|
3565
|
-
required:
|
|
3566
|
-
|
|
3618
|
+
required: NoAuthentication | SASL | ClientCertificateMTLS = Field(
|
|
3619
|
+
..., title='KafkaRequired'
|
|
3620
|
+
)
|
|
3621
|
+
masked: NoAuthentication1 | SASL1 | ClientCertificateMTLS1 | None = Field(
|
|
3622
|
+
None, title='KafkaMasked'
|
|
3623
|
+
)
|
|
3567
3624
|
optional: KafkaOptional | None = None
|
|
3568
3625
|
detectors: list[Detector] | None = Field(
|
|
3569
3626
|
None, description='Detectors to run on ingested content'
|
|
@@ -3576,6 +3633,254 @@ class KafkaInput(CoreInput):
|
|
|
3576
3633
|
resources: ResourceOverrides | None = None
|
|
3577
3634
|
|
|
3578
3635
|
|
|
3636
|
+
class NoAuthentication2(BaseModel):
|
|
3637
|
+
model_config = ConfigDict(
|
|
3638
|
+
extra='forbid',
|
|
3639
|
+
)
|
|
3640
|
+
auth_mode: Literal['NONE']
|
|
3641
|
+
url: AnyUrl = Field(
|
|
3642
|
+
..., description='Base URL of the cluster (e.g. https://localhost:9200)'
|
|
3643
|
+
)
|
|
3644
|
+
|
|
3645
|
+
|
|
3646
|
+
class BasicUsernamePassword(BaseModel):
|
|
3647
|
+
model_config = ConfigDict(
|
|
3648
|
+
extra='forbid',
|
|
3649
|
+
)
|
|
3650
|
+
auth_mode: Literal['BASIC']
|
|
3651
|
+
url: AnyUrl = Field(
|
|
3652
|
+
..., description='Base URL of the cluster (e.g. https://localhost:9200)'
|
|
3653
|
+
)
|
|
3654
|
+
|
|
3655
|
+
|
|
3656
|
+
class APIKeyBearerToken(BaseModel):
|
|
3657
|
+
model_config = ConfigDict(
|
|
3658
|
+
extra='forbid',
|
|
3659
|
+
)
|
|
3660
|
+
auth_mode: Literal['API_KEY']
|
|
3661
|
+
url: AnyUrl = Field(
|
|
3662
|
+
..., description='Base URL of the cluster (e.g. https://localhost:9200)'
|
|
3663
|
+
)
|
|
3664
|
+
|
|
3665
|
+
|
|
3666
|
+
class NoAuthentication3(BaseModel):
|
|
3667
|
+
model_config = ConfigDict(
|
|
3668
|
+
extra='forbid',
|
|
3669
|
+
)
|
|
3670
|
+
|
|
3671
|
+
|
|
3672
|
+
class BasicUsernamePassword1(BaseModel):
|
|
3673
|
+
"""
|
|
3674
|
+
Basic auth credentials.
|
|
3675
|
+
"""
|
|
3676
|
+
|
|
3677
|
+
model_config = ConfigDict(
|
|
3678
|
+
extra='forbid',
|
|
3679
|
+
)
|
|
3680
|
+
username: str = Field(..., description='Basic auth username')
|
|
3681
|
+
password: str = Field(..., description='Basic auth password')
|
|
3682
|
+
|
|
3683
|
+
|
|
3684
|
+
class APIKeyBearerToken1(BaseModel):
|
|
3685
|
+
"""
|
|
3686
|
+
API key / bearer token credential.
|
|
3687
|
+
"""
|
|
3688
|
+
|
|
3689
|
+
model_config = ConfigDict(
|
|
3690
|
+
extra='forbid',
|
|
3691
|
+
)
|
|
3692
|
+
api_key: str = Field(
|
|
3693
|
+
..., description='API key or bearer token, sent as an Authorization header'
|
|
3694
|
+
)
|
|
3695
|
+
|
|
3696
|
+
|
|
3697
|
+
class SearchEngineOptionalConnection(BaseModel):
|
|
3698
|
+
"""
|
|
3699
|
+
Cluster connection controls.
|
|
3700
|
+
"""
|
|
3701
|
+
|
|
3702
|
+
model_config = ConfigDict(
|
|
3703
|
+
extra='forbid',
|
|
3704
|
+
)
|
|
3705
|
+
verify_ssl: bool | None = Field(
|
|
3706
|
+
True, description='TLS certificate verification toggle'
|
|
3707
|
+
)
|
|
3708
|
+
request_timeout_seconds: float | None = Field(
|
|
3709
|
+
30,
|
|
3710
|
+
description='Network timeout in seconds for cluster API calls',
|
|
3711
|
+
ge=1.0,
|
|
3712
|
+
le=300.0,
|
|
3713
|
+
)
|
|
3714
|
+
|
|
3715
|
+
|
|
3716
|
+
class SearchEngineOptionalScope(BaseModel):
|
|
3717
|
+
"""
|
|
3718
|
+
Index selection scope.
|
|
3719
|
+
"""
|
|
3720
|
+
|
|
3721
|
+
model_config = ConfigDict(
|
|
3722
|
+
extra='forbid',
|
|
3723
|
+
)
|
|
3724
|
+
include_indices: list[str] | None = Field(
|
|
3725
|
+
None, description='Optional index allowlist'
|
|
3726
|
+
)
|
|
3727
|
+
exclude_indices: list[str] | None = Field(None, description='Index denylist')
|
|
3728
|
+
include_system_indices: bool | None = Field(
|
|
3729
|
+
False, description='Include system indices (names starting with .)'
|
|
3730
|
+
)
|
|
3731
|
+
index_limit: int | None = Field(
|
|
3732
|
+
None, description='Optional cap on number of index assets', ge=1
|
|
3733
|
+
)
|
|
3734
|
+
|
|
3735
|
+
|
|
3736
|
+
class ElasticsearchOptional(BaseModel):
|
|
3737
|
+
model_config = ConfigDict(
|
|
3738
|
+
extra='forbid',
|
|
3739
|
+
)
|
|
3740
|
+
connection: SearchEngineOptionalConnection | None = None
|
|
3741
|
+
scope: SearchEngineOptionalScope | None = None
|
|
3742
|
+
|
|
3743
|
+
|
|
3744
|
+
class ElasticsearchInput(CoreInput):
|
|
3745
|
+
type: Literal['ELASTICSEARCH'] | None = Field(
|
|
3746
|
+
None, description='Type of the asset or source'
|
|
3747
|
+
)
|
|
3748
|
+
required: NoAuthentication2 | BasicUsernamePassword | APIKeyBearerToken = Field(
|
|
3749
|
+
..., title='ElasticsearchRequired'
|
|
3750
|
+
)
|
|
3751
|
+
masked: NoAuthentication3 | BasicUsernamePassword1 | APIKeyBearerToken1 | None = (
|
|
3752
|
+
Field(None, title='ElasticsearchMasked')
|
|
3753
|
+
)
|
|
3754
|
+
optional: ElasticsearchOptional | None = None
|
|
3755
|
+
detectors: list[Detector] | None = Field(
|
|
3756
|
+
None, description='Detectors to run on ingested content'
|
|
3757
|
+
)
|
|
3758
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3759
|
+
None,
|
|
3760
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3761
|
+
)
|
|
3762
|
+
sampling: SamplingConfig
|
|
3763
|
+
resources: ResourceOverrides | None = None
|
|
3764
|
+
|
|
3765
|
+
|
|
3766
|
+
class OpenSearchOptional(BaseModel):
|
|
3767
|
+
model_config = ConfigDict(
|
|
3768
|
+
extra='forbid',
|
|
3769
|
+
)
|
|
3770
|
+
connection: SearchEngineOptionalConnection | None = None
|
|
3771
|
+
scope: SearchEngineOptionalScope | None = None
|
|
3772
|
+
|
|
3773
|
+
|
|
3774
|
+
class OpenSearchInput(CoreInput):
    # Recipe schema for a source whose ``type`` is "OPENSEARCH".
    # Uses the same auth variant models as ElasticsearchInput; only the
    # ``type`` literal, the field titles and the ``optional`` model differ.

    type: Literal["OPENSEARCH"] | None = Field(
        default=None,
        description="Type of the asset or source",
    )
    # One of the three auth variants (none / basic / API key).
    required: NoAuthentication2 | BasicUsernamePassword | APIKeyBearerToken = Field(
        ...,
        title="OpenSearchRequired",
    )
    # NOTE(review): presumably the secret counterpart of ``required`` --
    # confirm against the schema the models are generated from.
    masked: (
        NoAuthentication3 | BasicUsernamePassword1 | APIKeyBearerToken1 | None
    ) = Field(default=None, title="OpenSearchMasked")
    optional: OpenSearchOptional | None = Field(default=None)
    detectors: list[Detector] | None = Field(
        default=None,
        description="Detectors to run on ingested content",
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description="Reusable custom detector IDs selected from the custom detector catalog.",
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = Field(default=None)
|
|
3794
|
+
|
|
3795
|
+
|
|
3796
|
+
class NoAuthentication4(BaseModel):
    # "required" variant of MeilisearchInput for auth_mode == "NONE":
    # only the instance URL is supplied. Unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Fixed tag that distinguishes this variant from the API-key one.
    auth_mode: Literal["NONE"]
    url: AnyUrl = Field(
        ...,
        description="Base URL of the Meilisearch instance (e.g. http://localhost:7700)",
    )
|
|
3805
|
+
|
|
3806
|
+
|
|
3807
|
+
class APIKeyBearerToken2(BaseModel):
    # "required" variant of MeilisearchInput for auth_mode == "API_KEY".
    # The key itself is not declared here; this model only carries the URL.
    model_config = ConfigDict(extra="forbid")

    # Fixed tag that distinguishes this variant from the no-auth one.
    auth_mode: Literal["API_KEY"]
    url: AnyUrl = Field(
        ...,
        description="Base URL of the Meilisearch instance (e.g. http://localhost:7700)",
    )
|
|
3816
|
+
|
|
3817
|
+
|
|
3818
|
+
class NoAuthentication5(BaseModel):
    # "masked" variant of MeilisearchInput with no fields at all: combined
    # with extra="forbid", only an empty object validates against it.
    model_config = ConfigDict(extra="forbid")
|
|
3822
|
+
|
|
3823
|
+
|
|
3824
|
+
class APIKeyBearerToken3(BaseModel):
    """
    Meilisearch API key or master key, sent as an Authorization: Bearer header. Meilisearch has no separate username/password authentication mode.
    """

    # Unknown keys are rejected rather than ignored.
    model_config = ConfigDict(extra="forbid")

    # Mandatory: there is no default, so the variant is invalid without it.
    api_key: str = Field(default=..., description="API key or master key")
|
|
3833
|
+
|
|
3834
|
+
|
|
3835
|
+
class MeilisearchOptionalScope(BaseModel):
    """
    Index selection scope.
    """

    # Unknown keys are rejected rather than ignored.
    model_config = ConfigDict(extra="forbid")

    # Allow/deny lists are both optional and default to None.
    include_indices: list[str] | None = Field(
        default=None,
        description="Optional index allowlist (matches index uid)",
    )
    exclude_indices: list[str] | None = Field(
        default=None,
        description="Index denylist (matches index uid)",
    )
    # When given, the cap must be at least 1 (ge=1).
    index_limit: int | None = Field(
        default=None,
        description="Optional cap on number of index assets",
        ge=1,
    )
|
|
3852
|
+
|
|
3853
|
+
|
|
3854
|
+
class MeilisearchOptional(BaseModel):
    # Optional settings block referenced by MeilisearchInput.optional.
    # extra="forbid": keys other than the two declared here fail validation.
    model_config = ConfigDict(extra="forbid")

    # Connection options reuse the shared search-engine model, while the
    # scope uses the Meilisearch-specific one.
    connection: SearchEngineOptionalConnection | None = Field(default=None)
    scope: MeilisearchOptionalScope | None = Field(default=None)
|
|
3860
|
+
|
|
3861
|
+
|
|
3862
|
+
class MeilisearchInput(CoreInput):
    # Recipe schema for a source whose ``type`` is "MEILISEARCH".
    # Only ``required`` and ``sampling`` have no default; everything else
    # falls back to None when absent.

    type: Literal["MEILISEARCH"] | None = Field(
        default=None,
        description="Type of the asset or source",
    )
    # Two auth variants only: no authentication or API key.
    required: NoAuthentication4 | APIKeyBearerToken2 = Field(
        ...,
        title="MeilisearchRequired",
    )
    # Empty object for the no-auth case, or the object holding ``api_key``.
    masked: NoAuthentication5 | APIKeyBearerToken3 | None = Field(
        default=None,
        title="MeilisearchMasked",
    )
    optional: MeilisearchOptional | None = Field(default=None)
    detectors: list[Detector] | None = Field(
        default=None,
        description="Detectors to run on ingested content",
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description="Reusable custom detector IDs selected from the custom detector catalog.",
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = Field(default=None)
|
|
3882
|
+
|
|
3883
|
+
|
|
3579
3884
|
class YouTubeInput(CoreInput):
|
|
3580
3885
|
type: Literal['YOUTUBE'] | None = Field(
|
|
3581
3886
|
None, description='Type of the asset or source'
|
|
@@ -3624,6 +3929,9 @@ class SourceInput(
|
|
|
3624
3929
|
| HudiInput
|
|
3625
3930
|
| SparkCatalogInput
|
|
3626
3931
|
| KafkaInput
|
|
3932
|
+
| ElasticsearchInput
|
|
3933
|
+
| OpenSearchInput
|
|
3934
|
+
| MeilisearchInput
|
|
3627
3935
|
]
|
|
3628
3936
|
):
|
|
3629
3937
|
root: (
|
|
@@ -3655,6 +3963,9 @@ class SourceInput(
|
|
|
3655
3963
|
| HudiInput
|
|
3656
3964
|
| SparkCatalogInput
|
|
3657
3965
|
| KafkaInput
|
|
3966
|
+
| ElasticsearchInput
|
|
3967
|
+
| OpenSearchInput
|
|
3968
|
+
| MeilisearchInput
|
|
3658
3969
|
) = Field(
|
|
3659
3970
|
...,
|
|
3660
3971
|
description='Merged configuration schema with all source types and common definitions',
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Elasticsearch source — discovers indices and samples documents.
|
|
2
|
+
|
|
3
|
+
Uses plain REST calls (``requests``) rather than the ``elasticsearch-py``
|
|
4
|
+
client, since only read-only cluster/index/search endpoints are needed and
|
|
5
|
+
those are stable across Elasticsearch versions. Shared with ``OpenSearchSource``
|
|
6
|
+
via :mod:`src.sources.search_engine_base` — see that module for the REST logic.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from ...models.generated_input import ElasticsearchInput
|
|
14
|
+
from ..base import BaseSource
|
|
15
|
+
from ..search_engine_base import SearchEngineSourceMixin
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ElasticsearchSource(SearchEngineSourceMixin, BaseSource):
    """Elasticsearch flavour of the shared search-engine source.

    The class only supplies the engine identifiers and validates the recipe
    against ``ElasticsearchInput``; everything else is inherited from
    ``SearchEngineSourceMixin`` and ``BaseSource``.
    """

    source_type = "elasticsearch"
    ENGINE_LABEL = "Elasticsearch"

    def __init__(
        self,
        recipe: dict[str, Any],
        source_id: str | None = None,
        runner_id: str | None = None,
    ) -> None:
        """Initialise the base source, then parse ``recipe`` into a typed config.

        Args:
            recipe: Raw source recipe; must validate as ``ElasticsearchInput``.
            source_id: Forwarded unchanged to the base initialiser.
            runner_id: Forwarded to the base initialiser; a falsy value is
                replaced by ``"local-run"`` on this instance afterwards.
        """
        super().__init__(recipe, source_id, runner_id)
        # Validation errors from the generated model propagate to the caller.
        self.config = ElasticsearchInput.model_validate(recipe)
        self.runner_id = runner_id if runner_id else "local-run"
        # Starts empty; nothing in this class fills it.
        self._index_lookup: dict[str, str] = {}
|
|
@@ -9,6 +9,7 @@ from __future__ import annotations
|
|
|
9
9
|
|
|
10
10
|
import logging
|
|
11
11
|
import ssl as ssl_module
|
|
12
|
+
import tempfile
|
|
12
13
|
from collections.abc import AsyncGenerator
|
|
13
14
|
from datetime import UTC, datetime
|
|
14
15
|
from typing import Any
|
|
@@ -67,6 +68,7 @@ class KafkaSource(BaseSource):
|
|
|
67
68
|
def _client_kwargs(self) -> dict[str, Any]:
|
|
68
69
|
kwargs: dict[str, Any] = {"bootstrap_servers": self._bootstrap_servers()}
|
|
69
70
|
connection = self._connection()
|
|
71
|
+
ssl_ca = getattr(connection, "ssl_ca", None) if connection is not None else None
|
|
70
72
|
if connection is not None:
|
|
71
73
|
protocol = getattr(connection, "security_protocol", None)
|
|
72
74
|
if protocol is not None:
|
|
@@ -80,17 +82,34 @@ class KafkaSource(BaseSource):
|
|
|
80
82
|
)
|
|
81
83
|
if getattr(connection, "request_timeout_ms", None):
|
|
82
84
|
kwargs["request_timeout_ms"] = int(connection.request_timeout_ms)
|
|
83
|
-
if getattr(connection, "ssl_ca", None):
|
|
84
|
-
context = ssl_module.create_default_context(cadata=connection.ssl_ca)
|
|
85
|
-
kwargs["ssl_context"] = context
|
|
86
85
|
masked = self.config.masked
|
|
87
|
-
if masked
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
86
|
+
if getattr(masked, "sasl_username", None):
|
|
87
|
+
kwargs["sasl_plain_username"] = masked.sasl_username
|
|
88
|
+
if getattr(masked, "sasl_password", None):
|
|
89
|
+
kwargs["sasl_plain_password"] = masked.sasl_password
|
|
90
|
+
ssl_certfile = getattr(masked, "ssl_certfile", None)
|
|
91
|
+
ssl_keyfile = getattr(masked, "ssl_keyfile", None)
|
|
92
|
+
if ssl_ca or ssl_certfile:
|
|
93
|
+
context = ssl_module.create_default_context(cadata=ssl_ca)
|
|
94
|
+
if ssl_certfile and ssl_keyfile:
|
|
95
|
+
self._load_client_cert_chain(context, ssl_certfile, ssl_keyfile)
|
|
96
|
+
kwargs["ssl_context"] = context
|
|
92
97
|
return kwargs
|
|
93
98
|
|
|
99
|
+
@staticmethod
|
|
100
|
+
def _load_client_cert_chain(
|
|
101
|
+
context: ssl_module.SSLContext, certfile: str, keyfile: str
|
|
102
|
+
) -> None:
|
|
103
|
+
with (
|
|
104
|
+
tempfile.NamedTemporaryFile("w", suffix=".pem") as cert_tmp,
|
|
105
|
+
tempfile.NamedTemporaryFile("w", suffix=".pem") as key_tmp,
|
|
106
|
+
):
|
|
107
|
+
cert_tmp.write(certfile)
|
|
108
|
+
cert_tmp.flush()
|
|
109
|
+
key_tmp.write(keyfile)
|
|
110
|
+
key_tmp.flush()
|
|
111
|
+
context.load_cert_chain(certfile=cert_tmp.name, keyfile=key_tmp.name)
|
|
112
|
+
|
|
94
113
|
def _make_consumer(self, **extra: Any) -> Any:
    """Create a ``KafkaConsumer`` from the shared client settings.

    The keyword arguments are layered in this order, later layers winning:
    the result of ``_client_kwargs()``, then ``enable_auto_commit=False``,
    then whatever the caller passes in ``extra``.

    Args:
        **extra: Additional or overriding consumer keyword arguments.

    Returns:
        Whatever ``self._kafka.KafkaConsumer`` returns for those arguments.
    """
    consumer_kwargs = dict(self._client_kwargs())
    consumer_kwargs["enable_auto_commit"] = False
    consumer_kwargs.update(extra)
    return self._kafka.KafkaConsumer(**consumer_kwargs)
|