classifyre-cli 0.4.21__tar.gz → 0.4.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/PKG-INFO +1 -1
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/package.json +1 -1
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/pyproject.toml +1 -1
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/detector_pipeline.py +7 -1
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/file_parser.py +12 -0
- classifyre_cli-0.4.23/src/utils/transcription.py +325 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/uv.lock +71 -75
- classifyre_cli-0.4.21/src/utils/transcription.py +0 -177
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/.gitignore +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/.python-version +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/README.md +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/main.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/config.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.21/tests/detectors/threat → classifyre_cli-0.4.23/src/detectors/content}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.21/tests/detectors/secrets → classifyre_cli-0.4.23/src/detectors/pii}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.21/tests/detectors/pii → classifyre_cli-0.4.23/src/detectors/secrets}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.21/tests/detectors/custom → classifyre_cli-0.4.23/src/detectors/threat}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/main.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/models/generated_input.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/asset_metadata.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/databricks/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/email/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/email/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.21/tests/detectors/content → classifyre_cli-0.4.23/src/sources/neo4j}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/notion/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/notion/client.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/notion/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/object_storage/base.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.21/tests/detectors → classifyre_cli-0.4.23/src/sources/sqlite}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/tabular_base.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/youtube/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/youtube/source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/dependency_groups.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/file_metadata.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.21/src/sources/sqlite → classifyre_cli-0.4.23/tests/detectors}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.21/src/sources/neo4j → classifyre_cli-0.4.23/tests/detectors/content}/__init__.py +0 -0
- {classifyre_cli-0.4.21/src/detectors/threat → classifyre_cli-0.4.23/tests/detectors/custom}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.21/src/detectors/secrets → classifyre_cli-0.4.23/tests/detectors/pii}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.21/src/detectors/pii → classifyre_cli-0.4.23/tests/detectors/secrets}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.21/src/detectors/content → classifyre_cli-0.4.23/tests/detectors/threat}/__init__.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_assets_metadata_catalog.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_config.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_dependency_groups.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_email_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_notion_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_uv_sync.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_youtube_source.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_youtube_source_integration.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_file_metadata.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_file_parser.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_file_to_images.py +0 -0
- {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_transcription.py +0 -0
|
@@ -109,7 +109,10 @@ class DetectorPipeline:
|
|
|
109
109
|
|
|
110
110
|
scan_started = datetime.now(UTC)
|
|
111
111
|
ocr_enabled = self.source.ocr_enabled()
|
|
112
|
-
|
|
112
|
+
transcription_enabled = self.source.transcription_enabled()
|
|
113
|
+
text_content_type = self._text_content_type_for_asset(
|
|
114
|
+
asset.asset_type, ocr_enabled, transcription_enabled
|
|
115
|
+
)
|
|
113
116
|
link_content = self._build_links_payload(asset.links)
|
|
114
117
|
|
|
115
118
|
text_detectors = []
|
|
@@ -727,6 +730,7 @@ class DetectorPipeline:
|
|
|
727
730
|
self,
|
|
728
731
|
asset_type: OutputAssetType,
|
|
729
732
|
ocr_enabled: bool,
|
|
733
|
+
transcription_enabled: bool = False,
|
|
730
734
|
) -> str | None:
|
|
731
735
|
mapping = {
|
|
732
736
|
OutputAssetType.TXT: "text/plain",
|
|
@@ -737,6 +741,8 @@ class DetectorPipeline:
|
|
|
737
741
|
return mapping[asset_type]
|
|
738
742
|
if ocr_enabled and asset_type in {OutputAssetType.IMAGE, OutputAssetType.BINARY}:
|
|
739
743
|
return "text/plain"
|
|
744
|
+
if transcription_enabled and asset_type in {OutputAssetType.AUDIO, OutputAssetType.VIDEO}:
|
|
745
|
+
return "text/plain"
|
|
740
746
|
return None
|
|
741
747
|
|
|
742
748
|
@staticmethod
|
|
@@ -690,6 +690,18 @@ def iter_file_pages(
|
|
|
690
690
|
yield from _iter_parquet_pages(file_bytes, batch_size, include_column_names)
|
|
691
691
|
elif normalized in ("text/csv", "text/tab-separated-values"):
|
|
692
692
|
yield from _iter_csv_pages(file_bytes, include_column_names)
|
|
693
|
+
elif normalized.startswith(("audio/", "video/")) and enable_transcription:
|
|
694
|
+
# Stream transcript pages directly from the chunked transcription pipeline
|
|
695
|
+
# so the detector receives text as each ~10-min audio chunk completes
|
|
696
|
+
# instead of waiting for the full file and buffering the entire transcript.
|
|
697
|
+
from .transcription import iter_transcription_pages
|
|
698
|
+
|
|
699
|
+
yield from iter_transcription_pages(
|
|
700
|
+
file_bytes,
|
|
701
|
+
mime_type=normalized,
|
|
702
|
+
file_name=file_name,
|
|
703
|
+
segments_per_page=batch_size,
|
|
704
|
+
)
|
|
693
705
|
else:
|
|
694
706
|
text, error = extract_text(
|
|
695
707
|
file_bytes,
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
"""Audio/video transcription via faster-whisper (CPU-only by default).
|
|
2
|
+
|
|
3
|
+
Mirrors the lazy, thread-safe singleton pattern used for the Docling converter
|
|
4
|
+
in ``file_parser.py``: building a WhisperModel loads model weights (~1.5 GB for
|
|
5
|
+
``medium``) so it happens exactly once per process, and a semaphore caps
|
|
6
|
+
concurrent inference to avoid OOM under the worker thread pool.
|
|
7
|
+
|
|
8
|
+
Long audio files are split into ~10-minute WAV chunks using PyAV (bundled with
|
|
9
|
+
faster-whisper) before transcription. This bounds the per-chunk decoded audio
|
|
10
|
+
buffer to ~38 MB instead of the ~230 MB required for a full 1-hour file, making
|
|
11
|
+
the overall peak memory manageable alongside the 1.5 GB model weights.
|
|
12
|
+
|
|
13
|
+
Transcription is opt-in (per-source ``sampling.enable_transcription``); callers
|
|
14
|
+
treat a returned error the same way they treat any other parse failure.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import io
|
|
20
|
+
import logging
|
|
21
|
+
import tempfile
|
|
22
|
+
import wave
|
|
23
|
+
from collections.abc import Generator
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from threading import Lock, Semaphore
|
|
26
|
+
from urllib.parse import urlsplit
|
|
27
|
+
|
|
28
|
+
from ..config import get_whisper_config
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
# Map a normalized media MIME type to a temp-file extension faster-whisper /
|
|
33
|
+
# PyAV can demux. Extension is only a hint for the demuxer; PyAV sniffs the
|
|
34
|
+
# container regardless, so an imperfect guess still decodes.
|
|
35
|
+
_MIME_EXTENSION_HINTS = {
|
|
36
|
+
"audio/mpeg": ".mp3",
|
|
37
|
+
"audio/mp3": ".mp3",
|
|
38
|
+
"audio/wav": ".wav",
|
|
39
|
+
"audio/x-wav": ".wav",
|
|
40
|
+
"audio/mp4": ".m4a",
|
|
41
|
+
"audio/aac": ".aac",
|
|
42
|
+
"audio/ogg": ".ogg",
|
|
43
|
+
"audio/opus": ".opus",
|
|
44
|
+
"audio/flac": ".flac",
|
|
45
|
+
"audio/x-flac": ".flac",
|
|
46
|
+
"video/mp4": ".mp4",
|
|
47
|
+
"video/x-matroska": ".mkv",
|
|
48
|
+
"video/quicktime": ".mov",
|
|
49
|
+
"video/webm": ".webm",
|
|
50
|
+
"video/x-msvideo": ".avi",
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class _WhisperState:
|
|
55
|
+
"""Mutable singleton state for the WhisperModel (see _DoclingState rationale)."""
|
|
56
|
+
|
|
57
|
+
def __init__(self) -> None:
|
|
58
|
+
self.model: object = None
|
|
59
|
+
self.error: str | None = None
|
|
60
|
+
self.attempted: bool = False
|
|
61
|
+
# Allow one retry when the failure is a transient dependency install
|
|
62
|
+
# (network blip / registry timeout). A genuinely broken package fails on
|
|
63
|
+
# the retry too and is then cached permanently.
|
|
64
|
+
self.install_retry_remaining: int = 1
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
_whisper_state = _WhisperState()
|
|
68
|
+
_whisper_lock = Lock()
|
|
69
|
+
# A single medium model already holds ~1.5 GB; serialise inference so two
|
|
70
|
+
# concurrent transcriptions cannot push the worker over its memory limit.
|
|
71
|
+
_whisper_inference_sem = Semaphore(1)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _get_whisper_model() -> tuple[object, str | None]:
    """Return the cached WhisperModel, building it on the first call.

    Returns:
        A ``(model, error)`` pair read from the module-level state. Once a
        model or an error has been cached it is returned on every later call.
        When the first attempt fails with ``MissingSourceDependencyError`` and
        a retry is still available, nothing is cached: that call returns
        ``(None, None)`` and the next call attempts initialization again.
    """
    state = _whisper_state

    # Fast path without the lock: a model or an error is already cached.
    if not (state.model is None and state.error is None):
        return state.model, state.error

    with _whisper_lock:
        # Another caller already ran (or is past) the attempt; report what it left.
        if state.attempted:
            return state.model, state.error

        state.attempted = True
        try:
            from ..sources.dependencies import require_module

            faster_whisper = require_module(
                "faster_whisper",
                "audio/video transcription",
                ["transcription"],
                detail="Transcription requires the faster-whisper optional dependency.",
            )
            whisper_cfg = get_whisper_config()
            state.model = faster_whisper.WhisperModel(
                whisper_cfg.model,
                device=whisper_cfg.device,
                compute_type=whisper_cfg.compute_type,
            )
            logger.info(
                "Loaded faster-whisper model %s (device=%s, compute_type=%s)",
                whisper_cfg.model,
                whisper_cfg.device,
                whisper_cfg.compute_type,
            )
        except Exception as exc:
            from ..sources.dependencies import MissingSourceDependencyError

            may_retry = (
                isinstance(exc, MissingSourceDependencyError)
                and state.install_retry_remaining > 0
            )
            if not may_retry:
                # Cache the failure so later calls take the fast path.
                state.error = str(exc)
            else:
                # Spend the retry and re-arm the attempt flag so the next
                # call goes through initialization again.
                state.install_retry_remaining -= 1
                state.attempted = False
                logger.warning(
                    "Transcription dependency install failed (may be transient); "
                    "will retry once: %s",
                    exc,
                )

    return state.model, state.error
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _reset_whisper_singleton() -> None:
    """Put the cached WhisperModel state back to its initial values.

    Intended for test isolation only.
    """
    initial_values = (
        ("model", None),
        ("error", None),
        ("attempted", False),
        ("install_retry_remaining", 1),
    )
    with _whisper_lock:
        for attribute, value in initial_values:
            setattr(_whisper_state, attribute, value)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _temp_suffix(file_name: str, mime_type: str) -> str:
|
|
132
|
+
if file_name:
|
|
133
|
+
path = urlsplit(file_name).path or file_name
|
|
134
|
+
suffix = Path(path).suffix.lower()
|
|
135
|
+
if suffix:
|
|
136
|
+
return suffix
|
|
137
|
+
normalized = mime_type.split(";", 1)[0].strip().lower()
|
|
138
|
+
return _MIME_EXTENSION_HINTS.get(normalized, ".bin")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _pcm_to_wav(pcm_bytes: bytes, sample_rate: int) -> bytes:
|
|
142
|
+
"""Wrap raw int16 mono PCM bytes in a WAV container."""
|
|
143
|
+
buf = io.BytesIO()
|
|
144
|
+
with wave.open(buf, "wb") as wf:
|
|
145
|
+
wf.setnchannels(1)
|
|
146
|
+
wf.setsampwidth(2)
|
|
147
|
+
wf.setframerate(sample_rate)
|
|
148
|
+
wf.writeframes(pcm_bytes)
|
|
149
|
+
return buf.getvalue()
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
_AUDIO_CHUNK_SECONDS = 600 # 10-minute chunks → ~38 MB decoded audio per chunk
|
|
153
|
+
_TARGET_SAMPLE_RATE = 16_000
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _split_audio_chunks(
|
|
157
|
+
file_bytes: bytes,
|
|
158
|
+
chunk_seconds: int = _AUDIO_CHUNK_SECONDS,
|
|
159
|
+
) -> Generator[bytes, None, None]:
|
|
160
|
+
"""Decode audio bytes and yield WAV chunks via PyAV (bundled with faster-whisper).
|
|
161
|
+
|
|
162
|
+
Streams through the compressed audio frame-by-frame so only
|
|
163
|
+
``chunk_seconds`` worth of decoded PCM is held in memory at once instead of
|
|
164
|
+
the full decoded duration. Falls back to yielding the original bytes when
|
|
165
|
+
PyAV is unavailable or decoding fails.
|
|
166
|
+
"""
|
|
167
|
+
try:
|
|
168
|
+
import av as pyav # type: ignore[import-untyped]
|
|
169
|
+
except ImportError:
|
|
170
|
+
yield file_bytes
|
|
171
|
+
return
|
|
172
|
+
|
|
173
|
+
bytes_per_chunk = _TARGET_SAMPLE_RATE * chunk_seconds * 2 # int16 = 2 bytes/sample
|
|
174
|
+
current: bytearray = bytearray()
|
|
175
|
+
|
|
176
|
+
try:
|
|
177
|
+
container = pyav.open(io.BytesIO(file_bytes), metadata_errors="ignore")
|
|
178
|
+
audio_streams = [s for s in container.streams if s.type == "audio"]
|
|
179
|
+
if not audio_streams:
|
|
180
|
+
yield file_bytes
|
|
181
|
+
return
|
|
182
|
+
|
|
183
|
+
resampler = pyav.audio.resampler.AudioResampler(
|
|
184
|
+
format="s16", layout="mono", rate=_TARGET_SAMPLE_RATE
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
def _drain(frames: object) -> Generator[bytes, None, None]:
|
|
188
|
+
result = (
|
|
189
|
+
frames if isinstance(frames, list) else ([frames] if frames is not None else [])
|
|
190
|
+
)
|
|
191
|
+
for out_frame in result:
|
|
192
|
+
current.extend(bytes(out_frame.planes[0]))
|
|
193
|
+
while len(current) >= bytes_per_chunk:
|
|
194
|
+
yield _pcm_to_wav(bytes(current[:bytes_per_chunk]), _TARGET_SAMPLE_RATE)
|
|
195
|
+
del current[:bytes_per_chunk]
|
|
196
|
+
|
|
197
|
+
for frame in container.decode(audio_streams[0]):
|
|
198
|
+
yield from _drain(resampler.resample(frame))
|
|
199
|
+
|
|
200
|
+
# Flush the resampler's internal buffer.
|
|
201
|
+
try:
|
|
202
|
+
yield from _drain(resampler.resample(None))
|
|
203
|
+
except Exception:
|
|
204
|
+
pass
|
|
205
|
+
|
|
206
|
+
if current:
|
|
207
|
+
yield _pcm_to_wav(bytes(current), _TARGET_SAMPLE_RATE)
|
|
208
|
+
|
|
209
|
+
except Exception as exc:
|
|
210
|
+
logger.warning(
|
|
211
|
+
"Audio chunking failed (%s); falling back to full-file transcription: %s",
|
|
212
|
+
type(exc).__name__,
|
|
213
|
+
exc,
|
|
214
|
+
)
|
|
215
|
+
yield file_bytes
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def iter_transcription_pages(
    file_bytes: bytes,
    *,
    mime_type: str,
    file_name: str = "",
    segments_per_page: int = 50,
    chunk_seconds: int = _AUDIO_CHUNK_SECONDS,
) -> Generator[str, None, None]:
    """Transcribe audio/video in chunks, yielding pages of transcript text.

    Splits long audio into ``chunk_seconds``-long WAV chunks, transcribes each
    chunk while holding the inference semaphore, and then yields that chunk's
    non-empty segment texts in newline-joined pages of ``segments_per_page``
    segments. Pages are therefore produced as each chunk completes.

    Args:
        file_bytes: Encoded media bytes; empty input yields nothing.
        mime_type: Media MIME type, used for the temp-file suffix and in logs.
        file_name: Optional name/URL, used for the temp-file suffix and in logs.
        segments_per_page: Segments per yielded page; values below 1 are
            treated as 1.
        chunk_seconds: Chunk length passed to ``_split_audio_chunks``.

    Yields:
        Transcript pages. Nothing is yielded (a warning is logged) when the
        Whisper model is unavailable.

    Raises:
        Exception: Whatever writing the temp file or transcribing a chunk
            raises is logged and re-raised.
    """
    if not file_bytes:
        return

    model, error = _get_whisper_model()
    if error:
        logger.warning("Whisper model unavailable for %s: %s", file_name or mime_type, error)
        return
    if model is None:
        logger.warning("Whisper model not initialized for %s", file_name or mime_type)
        return

    cfg = get_whisper_config()
    suffix = _temp_suffix(file_name, mime_type)
    page_size = max(1, segments_per_page)

    for chunk_index, chunk_bytes in enumerate(_split_audio_chunks(file_bytes, chunk_seconds), 1):
        # RIFF is also the AVI container magic, so require the WAVE form type
        # before labelling the temp file as .wav.
        is_wav = chunk_bytes[:4] == b"RIFF" and chunk_bytes[8:12] == b"WAVE"
        chunk_suffix = ".wav" if is_wav else suffix
        texts: list[str] = []
        try:
            with tempfile.TemporaryDirectory(prefix="classifyre-whisper-") as temp_dir:
                temp_path = Path(temp_dir) / f"chunk{chunk_suffix}"
                temp_path.write_bytes(chunk_bytes)
                with _whisper_inference_sem:
                    segments, _info = model.transcribe(  # type: ignore[attr-defined]
                        str(temp_path),
                        beam_size=cfg.beam_size,
                        vad_filter=cfg.vad_filter,
                        word_timestamps=cfg.word_timestamps,
                    )
                    # faster-whisper returns a lazy generator: the model only
                    # runs while ``segments`` is iterated. Consume it while
                    # the semaphore is still held, otherwise the semaphore
                    # guards nothing and inference is not serialised.
                    for segment in segments:
                        text = segment.text.strip()
                        if text:
                            texts.append(text)
        except Exception as exc:
            logger.warning(
                "Transcription failed for chunk %d of %s: %s",
                chunk_index,
                file_name or mime_type,
                exc,
            )
            raise

        # Yield outside the semaphore and the temp directory so a slow
        # consumer cannot block other transcriptions or pin the temp file.
        for start in range(0, len(texts), page_size):
            yield "\n".join(texts[start : start + page_size])

        logger.info(
            "Transcribed chunk %d: %d chars from %s (%s)",
            chunk_index,
            sum(len(text) for text in texts),
            file_name or mime_type,
            mime_type,
        )
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def transcribe_media(
    file_bytes: bytes,
    *,
    mime_type: str,
    file_name: str = "",
) -> tuple[str, str | None]:
    """Transcribe audio/video bytes and return the whole transcript at once.

    This buffers the entire transcript before returning; prefer
    ``iter_transcription_pages`` when processing long files.

    Returns:
        ``(text, error)``. ``error`` is ``None`` on success (``text`` may be
        empty). On failure ``text`` is ``""`` and ``error`` describes why:
        the model error, a "not initialized" message, or the transcription
        exception.
    """
    if not file_bytes:
        return "", None

    model, model_error = _get_whisper_model()
    if model_error:
        return "", model_error
    if model is None:
        return "", "Whisper model not initialized"

    label = file_name or mime_type
    try:
        transcript = "\n".join(
            iter_transcription_pages(file_bytes, mime_type=mime_type, file_name=file_name)
        )
    except Exception as exc:
        logger.warning("Transcription failed for %s: %s", label, exc)
        return "", f"Transcription failed: {exc}"

    if transcript:
        logger.info(
            "Transcribed %d chars from %s (%s)",
            len(transcript),
            label,
            mime_type,
        )
    return transcript, None
|