classifyre-cli 0.4.17__tar.gz → 0.4.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/PKG-INFO +1 -1
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/package.json +1 -1
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/pyproject.toml +22 -1
- classifyre_cli-0.4.19/src/config.py +76 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/models/generated_input.py +138 -41
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/base.py +8 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/object_storage/base.py +10 -8
- classifyre_cli-0.4.19/src/sources/youtube/__init__.py +3 -0
- classifyre_cli-0.4.19/src/sources/youtube/source.py +589 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/dependency_groups.py +6 -2
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/file_parser.py +65 -4
- classifyre_cli-0.4.19/src/utils/transcription.py +177 -0
- classifyre_cli-0.4.19/tests/test_config.py +64 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_dependency_groups.py +9 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_s3_compatible_storage_source.py +2 -0
- classifyre_cli-0.4.19/tests/test_source_dependency_groups.py +266 -0
- classifyre_cli-0.4.19/tests/test_youtube_source.py +247 -0
- classifyre_cli-0.4.19/tests/test_youtube_source_integration.py +77 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/utils/test_file_parser.py +76 -0
- classifyre_cli-0.4.19/tests/utils/test_transcription.py +92 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/uv.lock +454 -199
- classifyre_cli-0.4.17/tests/test_source_dependency_groups.py +0 -74
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/.gitignore +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/.python-version +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/README.md +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/main.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/main.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/pipeline/detector_pipeline.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/asset_metadata.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/databricks/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/email/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/email/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/notion/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/notion/client.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/notion/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/tabular_base.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/file_metadata.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_assets_metadata_catalog.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_email_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_notion_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_uv_sync.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/utils/test_file_metadata.py +0 -0
- {classifyre_cli-0.4.17 → classifyre_cli-0.4.19}/tests/utils/test_file_to_images.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "classifyre-cli"
|
|
3
|
-
version = "0.4.17"
|
|
3
|
+
version = "0.4.19"
|
|
4
4
|
description = "Classifyre CLI — scan and classify unstructured data sources"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -100,6 +100,9 @@ custom = [
|
|
|
100
100
|
"scikit-learn>=1.7.2",
|
|
101
101
|
"gliner2>=1.3.1",
|
|
102
102
|
"sentence-transformers>=5.5.0",
|
|
103
|
+
# Direct declaration so the [tool.uv.sources] cpu-index pin applies (timm
|
|
104
|
+
# pulls torchvision transitively; see the ocr group note).
|
|
105
|
+
"torchvision>=0.27.0",
|
|
103
106
|
]
|
|
104
107
|
regex = [
|
|
105
108
|
"google-re2>=1.1",
|
|
@@ -122,6 +125,7 @@ detectors = [
|
|
|
122
125
|
{ include-group = "custom" },
|
|
123
126
|
{ include-group = "regex" },
|
|
124
127
|
{ include-group = "llm" },
|
|
128
|
+
{ include-group = "transcription" },
|
|
125
129
|
]
|
|
126
130
|
file-processing = [
|
|
127
131
|
"filetype>=1.2.0",
|
|
@@ -131,10 +135,21 @@ file-processing = [
|
|
|
131
135
|
"chardet>=7.4.3",
|
|
132
136
|
"pyarrow>=18.0.0",
|
|
133
137
|
]
|
|
138
|
+
transcription = [
|
|
139
|
+
# CPU audio/video transcription. faster-whisper decodes media via bundled
|
|
140
|
+
# PyAV (no system ffmpeg required) and runs ctranslate2 for inference.
|
|
141
|
+
"faster-whisper>=1.1.0",
|
|
142
|
+
]
|
|
134
143
|
ocr = [
|
|
135
144
|
{ include-group = "file-processing" },
|
|
136
145
|
"docling>=2.94.0",
|
|
137
146
|
"rapidocr-onnxruntime>=1.4.0",
|
|
147
|
+
# torchvision is a transitive dep of docling-ibm-models, but [tool.uv.sources]
|
|
148
|
+
# index pins only apply to dependencies declared directly in this project.
|
|
149
|
+
# Without this entry uv resolves the CUDA-built PyPI wheel, which fails to
|
|
150
|
+
# import against torch+cpu (surfaces as transformers "Could not import
|
|
151
|
+
# module 'AutoProcessor'").
|
|
152
|
+
"torchvision>=0.27.0",
|
|
138
153
|
]
|
|
139
154
|
postgresql = [
|
|
140
155
|
"psycopg2-binary>=2.9.12,<3.0.0",
|
|
@@ -183,6 +198,10 @@ google-cloud-storage = [
|
|
|
183
198
|
email = [
|
|
184
199
|
"imap-tools>=1.10.0,<2.0.0",
|
|
185
200
|
]
|
|
201
|
+
youtube = [
|
|
202
|
+
"yt-dlp>=2025.1.0",
|
|
203
|
+
"youtube-transcript-api>=1.0.0",
|
|
204
|
+
]
|
|
186
205
|
otel = [
|
|
187
206
|
"opentelemetry-sdk>=1.42.0",
|
|
188
207
|
"opentelemetry-exporter-otlp-proto-http>=1.27.0",
|
|
@@ -300,6 +319,8 @@ module = [
|
|
|
300
319
|
"tldextract.*",
|
|
301
320
|
"re2",
|
|
302
321
|
"re2.*",
|
|
322
|
+
"faster_whisper",
|
|
323
|
+
"faster_whisper.*",
|
|
303
324
|
]
|
|
304
325
|
ignore_missing_imports = true
|
|
305
326
|
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Runtime configuration loaded from environment variables.
|
|
2
|
+
|
|
3
|
+
A central, reusable place for tunables that may be overridden via `.env`
|
|
4
|
+
(loaded by ``src.main.load_local_env``) without touching source recipes. Each
|
|
5
|
+
concrete config section is a small pydantic model with a cached accessor so the
|
|
6
|
+
environment is read once per process.
|
|
7
|
+
|
|
8
|
+
Future sources/processors can add their own sections here following the same
|
|
9
|
+
pattern (``BaseModel`` + ``functools.lru_cache`` accessor).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
from functools import lru_cache
|
|
16
|
+
|
|
17
|
+
from pydantic import BaseModel, Field
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _env_str(name: str, default: str) -> str:
|
|
21
|
+
value = os.environ.get(name)
|
|
22
|
+
if value is None:
|
|
23
|
+
return default
|
|
24
|
+
value = value.strip()
|
|
25
|
+
return value or default
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _env_int(name: str, default: int) -> int:
|
|
29
|
+
raw = os.environ.get(name)
|
|
30
|
+
if raw is None or not raw.strip():
|
|
31
|
+
return default
|
|
32
|
+
try:
|
|
33
|
+
return int(raw.strip())
|
|
34
|
+
except ValueError:
|
|
35
|
+
return default
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _env_bool(name: str, default: bool) -> bool:
|
|
39
|
+
raw = os.environ.get(name)
|
|
40
|
+
if raw is None or not raw.strip():
|
|
41
|
+
return default
|
|
42
|
+
return raw.strip().lower() in {"1", "true", "yes", "on"}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class WhisperConfig(BaseModel):
|
|
46
|
+
"""faster-whisper transcription settings (CPU-only defaults).
|
|
47
|
+
|
|
48
|
+
Overridable via environment so a deployment can trade speed for accuracy
|
|
49
|
+
(e.g. a larger model, GPU device, or float16 compute) without code changes.
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
model: str = Field(
|
|
53
|
+
"medium", description="Whisper model size or path (e.g. tiny, base, medium, large-v3)."
|
|
54
|
+
)
|
|
55
|
+
device: str = Field("cpu", description="Inference device: cpu, cuda, or auto.")
|
|
56
|
+
compute_type: str = Field(
|
|
57
|
+
"int8", description="ctranslate2 compute type: int8, int8_float16, float16, float32."
|
|
58
|
+
)
|
|
59
|
+
beam_size: int = Field(5, ge=1, description="Beam search width.")
|
|
60
|
+
vad_filter: bool = Field(
|
|
61
|
+
True, description="Drop non-speech segments with Silero VAD before decoding."
|
|
62
|
+
)
|
|
63
|
+
word_timestamps: bool = Field(True, description="Emit per-word timestamps during decoding.")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@lru_cache(maxsize=1)
|
|
67
|
+
def get_whisper_config() -> WhisperConfig:
|
|
68
|
+
"""Return the process-wide WhisperConfig, populated from the environment."""
|
|
69
|
+
return WhisperConfig(
|
|
70
|
+
model=_env_str("CLASSIFYRE_WHISPER_MODEL", "medium"),
|
|
71
|
+
device=_env_str("CLASSIFYRE_WHISPER_DEVICE", "cpu"),
|
|
72
|
+
compute_type=_env_str("CLASSIFYRE_WHISPER_COMPUTE_TYPE", "int8"),
|
|
73
|
+
beam_size=_env_int("CLASSIFYRE_WHISPER_BEAM_SIZE", 5),
|
|
74
|
+
vad_filter=_env_bool("CLASSIFYRE_WHISPER_VAD_FILTER", True),
|
|
75
|
+
word_timestamps=_env_bool("CLASSIFYRE_WHISPER_WORD_TIMESTAMPS", True),
|
|
76
|
+
)
|
|
@@ -44,15 +44,7 @@ class AssetType(StrEnum):
|
|
|
44
44
|
SQLITE = 'SQLITE'
|
|
45
45
|
NOTION = 'NOTION'
|
|
46
46
|
EMAIL = 'EMAIL'
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class SourceCategory(StrEnum):
|
|
50
|
-
"""
|
|
51
|
-
Category of the source: TABULAR for structured databases (PostgreSQL, MySQL, MSSQL, Oracle, Hive, Databricks Unity Catalog, Snowflake), UNSTRUCTURED for text/web/document sources (WordPress, S3-Compatible Storage, Azure Blob Storage, Google Cloud Storage, Slack, MongoDB, PowerBI, Tableau, Confluence, Jira, Service Desk)
|
|
52
|
-
"""
|
|
53
|
-
|
|
54
|
-
TABULAR = 'TABULAR'
|
|
55
|
-
UNSTRUCTURED = 'UNSTRUCTURED'
|
|
47
|
+
YOUTUBE = 'YOUTUBE'
|
|
56
48
|
|
|
57
49
|
|
|
58
50
|
class DetectorType(StrEnum):
|
|
@@ -114,6 +106,10 @@ class SamplingConfig(BaseModel):
|
|
|
114
106
|
False,
|
|
115
107
|
description='When true, enable OCR/text extraction for supported binary documents and images before routing text-capable detectors.',
|
|
116
108
|
)
|
|
109
|
+
enable_transcription: bool | None = Field(
|
|
110
|
+
False,
|
|
111
|
+
description='When true, transcribe audio and video files to text (via faster-whisper) before routing text-capable detectors. Slower and requires the transcription dependency.',
|
|
112
|
+
)
|
|
117
113
|
order_by_column: str | None = Field(
|
|
118
114
|
None,
|
|
119
115
|
description='Column to use for LATEST sampling mode in tabular sources (usually created_at/updated_at). Auto-detected when not set.',
|
|
@@ -318,6 +314,115 @@ class EmailOptional(BaseModel):
|
|
|
318
314
|
scope: EmailOptionalScope | None = None
|
|
319
315
|
|
|
320
316
|
|
|
317
|
+
class Type(StrEnum):
|
|
318
|
+
"""
|
|
319
|
+
Type of the asset or source
|
|
320
|
+
"""
|
|
321
|
+
|
|
322
|
+
WORDPRESS = 'WORDPRESS'
|
|
323
|
+
SLACK = 'SLACK'
|
|
324
|
+
S3_COMPATIBLE_STORAGE = 'S3_COMPATIBLE_STORAGE'
|
|
325
|
+
AZURE_BLOB_STORAGE = 'AZURE_BLOB_STORAGE'
|
|
326
|
+
GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
|
|
327
|
+
POSTGRESQL = 'POSTGRESQL'
|
|
328
|
+
MYSQL = 'MYSQL'
|
|
329
|
+
MSSQL = 'MSSQL'
|
|
330
|
+
ORACLE = 'ORACLE'
|
|
331
|
+
HIVE = 'HIVE'
|
|
332
|
+
DATABRICKS = 'DATABRICKS'
|
|
333
|
+
SNOWFLAKE = 'SNOWFLAKE'
|
|
334
|
+
MONGODB = 'MONGODB'
|
|
335
|
+
NEO4J = 'NEO4J'
|
|
336
|
+
POWERBI = 'POWERBI'
|
|
337
|
+
TABLEAU = 'TABLEAU'
|
|
338
|
+
CONFLUENCE = 'CONFLUENCE'
|
|
339
|
+
JIRA = 'JIRA'
|
|
340
|
+
SERVICEDESK = 'SERVICEDESK'
|
|
341
|
+
SQLITE = 'SQLITE'
|
|
342
|
+
NOTION = 'NOTION'
|
|
343
|
+
EMAIL = 'EMAIL'
|
|
344
|
+
YOUTUBE = 'YOUTUBE'
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
class YouTubeRequired(BaseModel):
|
|
348
|
+
"""
|
|
349
|
+
Provide at least one of channels or video_urls (enforced at runtime).
|
|
350
|
+
"""
|
|
351
|
+
|
|
352
|
+
model_config = ConfigDict(
|
|
353
|
+
extra='forbid',
|
|
354
|
+
)
|
|
355
|
+
channels: list[str] | None = Field(
|
|
356
|
+
None,
|
|
357
|
+
description='Channel URLs or handles to list videos from (e.g. https://www.youtube.com/@OpenAI or @OpenAI). At least one of channels/video_urls is required.',
|
|
358
|
+
)
|
|
359
|
+
video_urls: list[str] | None = Field(
|
|
360
|
+
None,
|
|
361
|
+
description='Explicit video watch URLs to scan (e.g. https://www.youtube.com/watch?v=dQw4w9WgXcQ). At least one of channels/video_urls is required.',
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
class YouTubeMasked(BaseModel):
|
|
366
|
+
"""
|
|
367
|
+
Optional credentials. Leave empty for public videos.
|
|
368
|
+
"""
|
|
369
|
+
|
|
370
|
+
model_config = ConfigDict(
|
|
371
|
+
extra='forbid',
|
|
372
|
+
)
|
|
373
|
+
cookies: str | None = Field(
|
|
374
|
+
None,
|
|
375
|
+
description='Netscape-format cookie file contents, used by yt-dlp to access age-restricted or members-only videos.',
|
|
376
|
+
)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
class YouTubeOptionalTranscript(BaseModel):
|
|
380
|
+
"""
|
|
381
|
+
Transcript/caption fetching controls.
|
|
382
|
+
"""
|
|
383
|
+
|
|
384
|
+
model_config = ConfigDict(
|
|
385
|
+
extra='forbid',
|
|
386
|
+
)
|
|
387
|
+
languages: list[str] | None = Field(
|
|
388
|
+
None,
|
|
389
|
+
description='Preferred caption language codes in priority order (e.g. ["en"]). Empty means accept any available language.',
|
|
390
|
+
)
|
|
391
|
+
skip_transcript: bool | None = Field(
|
|
392
|
+
False,
|
|
393
|
+
description='When true, skip transcript fetching entirely (metadata-only assets, no detector content).',
|
|
394
|
+
)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
class YouTubeOptionalConnection(BaseModel):
|
|
398
|
+
"""
|
|
399
|
+
Network controls for yt-dlp and transcript fetching.
|
|
400
|
+
"""
|
|
401
|
+
|
|
402
|
+
model_config = ConfigDict(
|
|
403
|
+
extra='forbid',
|
|
404
|
+
)
|
|
405
|
+
proxy_url: str | None = Field(
|
|
406
|
+
None,
|
|
407
|
+
description='Optional HTTP/HTTPS/SOCKS proxy URL to mitigate rate-limiting when scanning at scale.',
|
|
408
|
+
)
|
|
409
|
+
request_timeout_seconds: int | None = Field(
|
|
410
|
+
30, description='Socket timeout for yt-dlp network operations.', ge=1, le=300
|
|
411
|
+
)
|
|
412
|
+
ignore_errors: bool | None = Field(
|
|
413
|
+
True,
|
|
414
|
+
description='Continue past individual videos that fail to extract instead of aborting the run.',
|
|
415
|
+
)
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
class YouTubeOptional(BaseModel):
|
|
419
|
+
model_config = ConfigDict(
|
|
420
|
+
extra='forbid',
|
|
421
|
+
)
|
|
422
|
+
transcript: YouTubeOptionalTranscript | None = None
|
|
423
|
+
connection: YouTubeOptionalConnection | None = None
|
|
424
|
+
|
|
425
|
+
|
|
321
426
|
class SlackRequired(BaseModel):
|
|
322
427
|
model_config = ConfigDict(
|
|
323
428
|
extra='forbid',
|
|
@@ -1926,35 +2031,6 @@ class CoreInput(BaseModel):
|
|
|
1926
2031
|
resources: ResourceOverrides | None = None
|
|
1927
2032
|
|
|
1928
2033
|
|
|
1929
|
-
class Type(StrEnum):
|
|
1930
|
-
"""
|
|
1931
|
-
Type of the asset or source
|
|
1932
|
-
"""
|
|
1933
|
-
|
|
1934
|
-
WORDPRESS = 'WORDPRESS'
|
|
1935
|
-
SLACK = 'SLACK'
|
|
1936
|
-
S3_COMPATIBLE_STORAGE = 'S3_COMPATIBLE_STORAGE'
|
|
1937
|
-
AZURE_BLOB_STORAGE = 'AZURE_BLOB_STORAGE'
|
|
1938
|
-
GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
|
|
1939
|
-
POSTGRESQL = 'POSTGRESQL'
|
|
1940
|
-
MYSQL = 'MYSQL'
|
|
1941
|
-
MSSQL = 'MSSQL'
|
|
1942
|
-
ORACLE = 'ORACLE'
|
|
1943
|
-
HIVE = 'HIVE'
|
|
1944
|
-
DATABRICKS = 'DATABRICKS'
|
|
1945
|
-
SNOWFLAKE = 'SNOWFLAKE'
|
|
1946
|
-
MONGODB = 'MONGODB'
|
|
1947
|
-
NEO4J = 'NEO4J'
|
|
1948
|
-
POWERBI = 'POWERBI'
|
|
1949
|
-
TABLEAU = 'TABLEAU'
|
|
1950
|
-
CONFLUENCE = 'CONFLUENCE'
|
|
1951
|
-
JIRA = 'JIRA'
|
|
1952
|
-
SERVICEDESK = 'SERVICEDESK'
|
|
1953
|
-
SQLITE = 'SQLITE'
|
|
1954
|
-
NOTION = 'NOTION'
|
|
1955
|
-
EMAIL = 'EMAIL'
|
|
1956
|
-
|
|
1957
|
-
|
|
1958
2034
|
class SlackInput(CoreInput):
|
|
1959
2035
|
type: Literal['SLACK'] = Field('SLACK', description='Type of the asset or source')
|
|
1960
2036
|
required: SlackRequired
|
|
@@ -2428,7 +2504,7 @@ class ConfluenceOptionalConnection(BaseModel):
|
|
|
2428
2504
|
)
|
|
2429
2505
|
|
|
2430
2506
|
|
|
2431
|
-
class Type17(StrEnum):
|
|
2507
|
+
class Type18(StrEnum):
|
|
2432
2508
|
"""
|
|
2433
2509
|
Filter spaces by space type
|
|
2434
2510
|
"""
|
|
@@ -2465,7 +2541,7 @@ class ConfluenceOptionalScopeSpaces(BaseModel):
|
|
|
2465
2541
|
keys: list[str] | None = Field(
|
|
2466
2542
|
None, description='Filter spaces by keys (up to 250)', max_length=250
|
|
2467
2543
|
)
|
|
2468
|
-
type: Type17 | None = Field(None, description='Filter spaces by space type')
|
|
2544
|
+
type: Type18 | None = Field(None, description='Filter spaces by space type')
|
|
2469
2545
|
status: Status | None = Field(None, description='Filter spaces by status')
|
|
2470
2546
|
labels: list[str] | None = Field(
|
|
2471
2547
|
None,
|
|
@@ -2731,7 +2807,7 @@ class ServiceDeskOptional(BaseModel):
|
|
|
2731
2807
|
content: ServiceDeskOptionalContent | None = None
|
|
2732
2808
|
|
|
2733
2809
|
|
|
2734
|
-
class
|
|
2810
|
+
class Type19(StrEnum):
|
|
2735
2811
|
"""
|
|
2736
2812
|
Type of the asset or source
|
|
2737
2813
|
"""
|
|
@@ -2758,6 +2834,7 @@ class Type18(StrEnum):
|
|
|
2758
2834
|
SQLITE = 'SQLITE'
|
|
2759
2835
|
NOTION = 'NOTION'
|
|
2760
2836
|
EMAIL = 'EMAIL'
|
|
2837
|
+
YOUTUBE = 'YOUTUBE'
|
|
2761
2838
|
|
|
2762
2839
|
|
|
2763
2840
|
class ConfluenceInput(CoreInput):
|
|
@@ -2995,6 +3072,24 @@ class NotionInput(CoreInput):
|
|
|
2995
3072
|
resources: ResourceOverrides | None = None
|
|
2996
3073
|
|
|
2997
3074
|
|
|
3075
|
+
class YouTubeInput(CoreInput):
|
|
3076
|
+
type: Literal['YOUTUBE'] = Field(
|
|
3077
|
+
'YOUTUBE', description='Type of the asset or source'
|
|
3078
|
+
)
|
|
3079
|
+
required: YouTubeRequired
|
|
3080
|
+
masked: YouTubeMasked | None = None
|
|
3081
|
+
optional: YouTubeOptional | None = None
|
|
3082
|
+
detectors: list[Detector] | None = Field(
|
|
3083
|
+
None, description='Detectors to run on ingested content'
|
|
3084
|
+
)
|
|
3085
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
3086
|
+
None,
|
|
3087
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
3088
|
+
)
|
|
3089
|
+
sampling: SamplingConfig
|
|
3090
|
+
resources: ResourceOverrides | None = None
|
|
3091
|
+
|
|
3092
|
+
|
|
2998
3093
|
class SourceInput(
|
|
2999
3094
|
RootModel[
|
|
3000
3095
|
SlackInput
|
|
@@ -3019,6 +3114,7 @@ class SourceInput(
|
|
|
3019
3114
|
| SQLiteInput
|
|
3020
3115
|
| NotionInput
|
|
3021
3116
|
| EmailInput
|
|
3117
|
+
| YouTubeInput
|
|
3022
3118
|
]
|
|
3023
3119
|
):
|
|
3024
3120
|
root: (
|
|
@@ -3044,6 +3140,7 @@ class SourceInput(
|
|
|
3044
3140
|
| SQLiteInput
|
|
3045
3141
|
| NotionInput
|
|
3046
3142
|
| EmailInput
|
|
3143
|
+
| YouTubeInput
|
|
3047
3144
|
) = Field(
|
|
3048
3145
|
...,
|
|
3049
3146
|
description='Merged configuration schema with all source types and common definitions',
|
|
@@ -214,6 +214,12 @@ class BaseSource(ABC):
|
|
|
214
214
|
sampling = getattr(config, "sampling", None) if config is not None else None
|
|
215
215
|
return bool(getattr(sampling, "enable_ocr", False))
|
|
216
216
|
|
|
217
|
+
def transcription_enabled(self) -> bool:
|
|
218
|
+
"""Return whether sampling-level audio/video transcription is enabled."""
|
|
219
|
+
config = getattr(self, "config", None)
|
|
220
|
+
sampling = getattr(config, "sampling", None) if config is not None else None
|
|
221
|
+
return bool(getattr(sampling, "enable_transcription", False))
|
|
222
|
+
|
|
217
223
|
def parse_asset_bytes(
|
|
218
224
|
self,
|
|
219
225
|
file_bytes: bytes,
|
|
@@ -228,6 +234,7 @@ class BaseSource(ABC):
|
|
|
228
234
|
declared_mime_type=declared_mime_type,
|
|
229
235
|
file_name=file_name,
|
|
230
236
|
enable_ocr=self.ocr_enabled(),
|
|
237
|
+
enable_transcription=self.transcription_enabled(),
|
|
231
238
|
)
|
|
232
239
|
|
|
233
240
|
def iter_asset_pages(
|
|
@@ -248,6 +255,7 @@ class BaseSource(ABC):
|
|
|
248
255
|
include_column_names,
|
|
249
256
|
file_name=file_name,
|
|
250
257
|
enable_ocr=self.ocr_enabled(),
|
|
258
|
+
enable_transcription=self.transcription_enabled(),
|
|
251
259
|
)
|
|
252
260
|
|
|
253
261
|
async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
|
|
@@ -360,14 +360,16 @@ class ObjectStorageSourceBase(BaseSource, ABC):
|
|
|
360
360
|
)
|
|
361
361
|
normalized_mime = mime_type.split(";", 1)[0].strip().lower()
|
|
362
362
|
|
|
363
|
-
# Non-extractable types (images,
|
|
364
|
-
#
|
|
365
|
-
#
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
"
|
|
363
|
+
# Non-extractable types (images, opaque binary) carry no text. Audio/video
|
|
364
|
+
# are extractable only when transcription is enabled — otherwise they are
|
|
365
|
+
# treated as opaque binary. Everything else defers extraction to
|
|
366
|
+
# fetch_content_pages() so detectors receive content in configurable-sized
|
|
367
|
+
# pages instead of one monolithic blob.
|
|
368
|
+
is_media = normalized_mime.startswith(("audio/", "video/"))
|
|
369
|
+
is_non_extractable = (
|
|
370
|
+
normalized_mime.startswith("image/")
|
|
371
|
+
or (is_media and not self.transcription_enabled())
|
|
372
|
+
or normalized_mime in ("application/octet-stream", "application/zip")
|
|
371
373
|
)
|
|
372
374
|
|
|
373
375
|
return ContentSnapshot(
|