classifyre-cli 0.4.21__tar.gz → 0.4.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206) hide show
  1. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/package.json +1 -1
  4. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/pyproject.toml +1 -1
  5. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/detector_pipeline.py +7 -1
  6. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/file_parser.py +12 -0
  7. classifyre_cli-0.4.23/src/utils/transcription.py +325 -0
  8. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/uv.lock +71 -75
  9. classifyre_cli-0.4.21/src/utils/transcription.py +0 -177
  10. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/.gitignore +0 -0
  11. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/.python-version +0 -0
  12. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/README.md +0 -0
  13. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/main.py +0 -0
  14. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/scripts/generate_models.py +0 -0
  15. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/__init__.py +0 -0
  16. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/config.py +0 -0
  17. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/__init__.py +0 -0
  18. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/base.py +0 -0
  19. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/broken_links/__init__.py +0 -0
  20. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/broken_links/detector.py +0 -0
  21. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/config.py +0 -0
  22. {classifyre_cli-0.4.21/tests/detectors/threat → classifyre_cli-0.4.23/src/detectors/content}/__init__.py +0 -0
  23. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/__init__.py +0 -0
  24. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/detector.py +0 -0
  25. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/extractor.py +0 -0
  26. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/__init__.py +0 -0
  27. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_base.py +0 -0
  28. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_factory.py +0 -0
  29. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  30. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_gliner2.py +0 -0
  31. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_image_classification.py +0 -0
  32. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_llm.py +0 -0
  33. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_object_detection.py +0 -0
  34. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_regex.py +0 -0
  35. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/runners/_text_classification.py +0 -0
  36. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/custom/trainer.py +0 -0
  37. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/dependencies.py +0 -0
  38. {classifyre_cli-0.4.21/tests/detectors/secrets → classifyre_cli-0.4.23/src/detectors/pii}/__init__.py +0 -0
  39. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/pii/detector.py +0 -0
  40. {classifyre_cli-0.4.21/tests/detectors/pii → classifyre_cli-0.4.23/src/detectors/secrets}/__init__.py +0 -0
  41. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/secrets/detector.py +0 -0
  42. {classifyre_cli-0.4.21/tests/detectors/custom → classifyre_cli-0.4.23/src/detectors/threat}/__init__.py +0 -0
  43. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/threat/code_security_detector.py +0 -0
  44. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/detectors/threat/yara_detector.py +0 -0
  45. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/main.py +0 -0
  46. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/models/generated_detectors.py +0 -0
  47. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/models/generated_input.py +0 -0
  48. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/models/generated_single_asset_scan_results.py +0 -0
  49. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/__init__.py +0 -0
  50. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/base.py +0 -0
  51. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/console.py +0 -0
  52. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/factory.py +0 -0
  53. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/file.py +0 -0
  54. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/outputs/rest.py +0 -0
  55. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/__init__.py +0 -0
  56. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/content_provider.py +0 -0
  57. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/parsed_content_provider.py +0 -0
  58. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/pipeline/worker_pool.py +0 -0
  59. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sandbox/__init__.py +0 -0
  60. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sandbox/runner.py +0 -0
  61. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/__init__.py +0 -0
  62. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/asset_metadata.py +0 -0
  63. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/atlassian_common.py +0 -0
  64. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/azure_blob_storage/__init__.py +0 -0
  65. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/azure_blob_storage/source.py +0 -0
  66. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/base.py +0 -0
  67. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/confluence/__init__.py +0 -0
  68. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/confluence/source.py +0 -0
  69. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/databricks/__init__.py +0 -0
  70. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/databricks/source.py +0 -0
  71. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/dependencies.py +0 -0
  72. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/email/__init__.py +0 -0
  73. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/email/source.py +0 -0
  74. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/google_cloud_storage/__init__.py +0 -0
  75. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/google_cloud_storage/source.py +0 -0
  76. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/hive/__init__.py +0 -0
  77. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/hive/source.py +0 -0
  78. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/jira/__init__.py +0 -0
  79. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/jira/source.py +0 -0
  80. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mongodb/__init__.py +0 -0
  81. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mongodb/source.py +0 -0
  82. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mssql/__init__.py +0 -0
  83. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mssql/source.py +0 -0
  84. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mysql/__init__.py +0 -0
  85. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/mysql/source.py +0 -0
  86. {classifyre_cli-0.4.21/tests/detectors/content → classifyre_cli-0.4.23/src/sources/neo4j}/__init__.py +0 -0
  87. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/neo4j/source.py +0 -0
  88. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/notion/__init__.py +0 -0
  89. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/notion/client.py +0 -0
  90. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/notion/source.py +0 -0
  91. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/object_storage/base.py +0 -0
  92. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/oracle/__init__.py +0 -0
  93. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/oracle/source.py +0 -0
  94. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/postgresql/__init__.py +0 -0
  95. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/postgresql/source.py +0 -0
  96. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/powerbi/__init__.py +0 -0
  97. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/powerbi/source.py +0 -0
  98. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/recipe_normalizer.py +0 -0
  99. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/s3_compatible_storage/README.md +0 -0
  100. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/s3_compatible_storage/__init__.py +0 -0
  101. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/s3_compatible_storage/source.py +0 -0
  102. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/servicedesk/__init__.py +0 -0
  103. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/servicedesk/source.py +0 -0
  104. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/slack/__init__.py +0 -0
  105. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/slack/source.py +0 -0
  106. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/snowflake/__init__.py +0 -0
  107. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/snowflake/source.py +0 -0
  108. {classifyre_cli-0.4.21/tests/detectors → classifyre_cli-0.4.23/src/sources/sqlite}/__init__.py +0 -0
  109. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/sqlite/source.py +0 -0
  110. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/tableau/__init__.py +0 -0
  111. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/tableau/source.py +0 -0
  112. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/tabular_base.py +0 -0
  113. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/tabular_utils.py +0 -0
  114. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/wordpress/__init__.py +0 -0
  115. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/wordpress/source.py +0 -0
  116. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/youtube/__init__.py +0 -0
  117. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/sources/youtube/source.py +0 -0
  118. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/telemetry.py +0 -0
  119. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/__init__.py +0 -0
  120. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/content_extraction.py +0 -0
  121. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/dependency_groups.py +0 -0
  122. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/embedded_images.py +0 -0
  123. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/file_metadata.py +0 -0
  124. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/file_to_images.py +0 -0
  125. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/hashing.py +0 -0
  126. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/uv_sync.py +0 -0
  127. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/src/utils/validation.py +0 -0
  128. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/__init__.py +0 -0
  129. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/conftest.py +0 -0
  130. {classifyre_cli-0.4.21/src/sources/sqlite → classifyre_cli-0.4.23/tests/detectors}/__init__.py +0 -0
  131. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  132. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/conftest.py +0 -0
  133. {classifyre_cli-0.4.21/src/sources/neo4j → classifyre_cli-0.4.23/tests/detectors/content}/__init__.py +0 -0
  134. {classifyre_cli-0.4.21/src/detectors/threat → classifyre_cli-0.4.23/tests/detectors/custom}/__init__.py +0 -0
  135. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/conftest.py +0 -0
  136. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  137. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_llm_runner.py +0 -0
  138. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  139. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_regex_runner.py +0 -0
  140. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/custom/test_transformer_runners.py +0 -0
  141. {classifyre_cli-0.4.21/src/detectors/secrets → classifyre_cli-0.4.23/tests/detectors/pii}/__init__.py +0 -0
  142. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/pii/conftest.py +0 -0
  143. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/pii/sample_invoice.pdf +0 -0
  144. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/pii/test_pii_detector.py +0 -0
  145. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  146. {classifyre_cli-0.4.21/src/detectors/pii → classifyre_cli-0.4.23/tests/detectors/secrets}/__init__.py +0 -0
  147. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  148. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  149. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_base_detector.py +0 -0
  150. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  151. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  152. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_detector_pipeline_types.py +0 -0
  153. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_detector_schema_examples.py +0 -0
  154. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_detector_types.py +0 -0
  155. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_phase2_detectors.py +0 -0
  156. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/test_registry.py +0 -0
  157. {classifyre_cli-0.4.21/src/detectors/content → classifyre_cli-0.4.23/tests/detectors/threat}/__init__.py +0 -0
  158. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/threat/test_code_security_detector.py +0 -0
  159. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/detectors/threat/test_yara_detector.py +0 -0
  160. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  161. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/integration/test_wordpress_links_assets.py +0 -0
  162. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/pipeline/test_detector_pipeline.py +0 -0
  163. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/pipeline/test_worker_pool.py +0 -0
  164. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_assets_metadata_catalog.py +0 -0
  165. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_azure_blob_storage_source.py +0 -0
  166. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_base_source_attachment.py +0 -0
  167. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_base_source_sampling.py +0 -0
  168. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_config.py +0 -0
  169. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_confluence_source.py +0 -0
  170. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_custom_extractor.py +0 -0
  171. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_databricks_source.py +0 -0
  172. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_dependency_groups.py +0 -0
  173. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_email_source.py +0 -0
  174. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_google_cloud_storage_source.py +0 -0
  175. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_hashing.py +0 -0
  176. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_hive_source.py +0 -0
  177. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_jira_source.py +0 -0
  178. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_mongodb_source.py +0 -0
  179. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_mssql_source.py +0 -0
  180. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_mysql_source.py +0 -0
  181. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_neo4j_source.py +0 -0
  182. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_notion_source.py +0 -0
  183. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_oracle_source.py +0 -0
  184. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_outputs.py +0 -0
  185. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_postgresql_source.py +0 -0
  186. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_powerbi_source.py +0 -0
  187. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_recipe_normalizer.py +0 -0
  188. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_s3_compatible_storage_source.py +0 -0
  189. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_sandbox_runner.py +0 -0
  190. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_servicedesk_source.py +0 -0
  191. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_slack_source.py +0 -0
  192. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_snowflake_source.py +0 -0
  193. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_source_dependency_groups.py +0 -0
  194. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_sqlite_source.py +0 -0
  195. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_tableau_source.py +0 -0
  196. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_tabular_utils.py +0 -0
  197. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_uv_sync.py +0 -0
  198. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_wordpress_source.py +0 -0
  199. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_youtube_source.py +0 -0
  200. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/test_youtube_source_integration.py +0 -0
  201. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_content_extraction.py +0 -0
  202. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_embedded_images.py +0 -0
  203. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_file_metadata.py +0 -0
  204. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_file_parser.py +0 -0
  205. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_file_to_images.py +0 -0
  206. {classifyre_cli-0.4.21 → classifyre_cli-0.4.23}/tests/utils/test_transcription.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 262 packages in 10ms
2
+ Resolved 262 packages in 370ms
3
3
  Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.21
3
+ Version: 0.4.23
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.21",
3
+ "version": "0.4.23",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.21"
3
+ version = "0.4.23"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -109,7 +109,10 @@ class DetectorPipeline:
109
109
 
110
110
  scan_started = datetime.now(UTC)
111
111
  ocr_enabled = self.source.ocr_enabled()
112
- text_content_type = self._text_content_type_for_asset(asset.asset_type, ocr_enabled)
112
+ transcription_enabled = self.source.transcription_enabled()
113
+ text_content_type = self._text_content_type_for_asset(
114
+ asset.asset_type, ocr_enabled, transcription_enabled
115
+ )
113
116
  link_content = self._build_links_payload(asset.links)
114
117
 
115
118
  text_detectors = []
@@ -727,6 +730,7 @@ class DetectorPipeline:
727
730
  self,
728
731
  asset_type: OutputAssetType,
729
732
  ocr_enabled: bool,
733
+ transcription_enabled: bool = False,
730
734
  ) -> str | None:
731
735
  mapping = {
732
736
  OutputAssetType.TXT: "text/plain",
@@ -737,6 +741,8 @@ class DetectorPipeline:
737
741
  return mapping[asset_type]
738
742
  if ocr_enabled and asset_type in {OutputAssetType.IMAGE, OutputAssetType.BINARY}:
739
743
  return "text/plain"
744
+ if transcription_enabled and asset_type in {OutputAssetType.AUDIO, OutputAssetType.VIDEO}:
745
+ return "text/plain"
740
746
  return None
741
747
 
742
748
  @staticmethod
@@ -690,6 +690,18 @@ def iter_file_pages(
690
690
  yield from _iter_parquet_pages(file_bytes, batch_size, include_column_names)
691
691
  elif normalized in ("text/csv", "text/tab-separated-values"):
692
692
  yield from _iter_csv_pages(file_bytes, include_column_names)
693
+ elif normalized.startswith(("audio/", "video/")) and enable_transcription:
694
+ # Stream transcript pages directly from the chunked transcription pipeline
695
+ # so the detector receives text as each ~10-min audio chunk completes
696
+ # instead of waiting for the full file and buffering the entire transcript.
697
+ from .transcription import iter_transcription_pages
698
+
699
+ yield from iter_transcription_pages(
700
+ file_bytes,
701
+ mime_type=normalized,
702
+ file_name=file_name,
703
+ segments_per_page=batch_size,
704
+ )
693
705
  else:
694
706
  text, error = extract_text(
695
707
  file_bytes,
@@ -0,0 +1,325 @@
1
+ """Audio/video transcription via faster-whisper (CPU-only by default).
2
+
3
+ Mirrors the lazy, thread-safe singleton pattern used for the Docling converter
4
+ in ``file_parser.py``: building a WhisperModel loads model weights (~1.5 GB for
5
+ ``medium``) so it happens exactly once per process, and a semaphore caps
6
+ concurrent inference to avoid OOM under the worker thread pool.
7
+
8
+ Long audio files are split into ~10-minute WAV chunks using PyAV (bundled with
9
+ faster-whisper) before transcription. This bounds the per-chunk decoded audio
10
+ buffer to ~38 MB instead of the ~230 MB required for a full 1-hour file, making
11
+ the overall peak memory manageable alongside the 1.5 GB model weights.
12
+
13
+ Transcription is opt-in (per-source ``sampling.enable_transcription``); callers
14
+ treat a returned error the same way they treat any other parse failure.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import io
20
+ import logging
21
+ import tempfile
22
+ import wave
23
+ from collections.abc import Generator
24
+ from pathlib import Path
25
+ from threading import Lock, Semaphore
26
+ from urllib.parse import urlsplit
27
+
28
+ from ..config import get_whisper_config
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
# Temp-file extension to use for each normalized media MIME type. The suffix is
# merely a demuxer hint for faster-whisper / PyAV: the container is sniffed from
# the data itself, so an imperfect guess still decodes.
_MIME_EXTENSION_HINTS = {
    # Audio
    "audio/mpeg": ".mp3",
    "audio/mp3": ".mp3",
    "audio/wav": ".wav",
    "audio/x-wav": ".wav",
    "audio/mp4": ".m4a",
    "audio/aac": ".aac",
    "audio/ogg": ".ogg",
    "audio/opus": ".opus",
    "audio/flac": ".flac",
    "audio/x-flac": ".flac",
    # Video
    "video/mp4": ".mp4",
    "video/x-matroska": ".mkv",
    "video/quicktime": ".mov",
    "video/webm": ".webm",
    "video/x-msvideo": ".avi",
}
52
+
53
+
54
class _WhisperState:
    """Holder for the process-wide WhisperModel and the outcome of loading it.

    Kept as a mutable object (same rationale as ``_DoclingState``) so the
    loader can update the cached model / error in place.
    """

    def __init__(self) -> None:
        # The loaded model; stays None until a load succeeds.
        self.model: object = None
        # Message of the load failure that has been cached, if any.
        self.error: str | None = None
        # Set once a load attempt has started; cleared again only to permit
        # the single dependency-install retry.
        self.attempted: bool = False
        # Number of extra attempts granted when the failure is a transient
        # dependency install (network blip / registry timeout). A genuinely
        # broken package fails on the retry too and is then cached permanently.
        self.install_retry_remaining: int = 1


# A single medium model already holds ~1.5 GB, so inference is serialised: two
# concurrent transcriptions must not push the worker over its memory limit.
_whisper_inference_sem = Semaphore(1)
# Guards initialisation and reset of the shared state below.
_whisper_lock = Lock()
_whisper_state = _WhisperState()
72
+
73
+
74
def _get_whisper_model() -> tuple[object, str | None]:
    """Return the cached WhisperModel, building it on the first call.

    Returns:
        A ``(model, error)`` pair. ``model`` is ``None`` whenever no model has
        been loaded. ``error`` is the message of a load failure that has been
        cached; it is never an empty string. Both are ``None`` on the call
        that hit a dependency failure while a retry was still available, so
        that a later call attempts the load again.
    """
    # Fast path without the lock once an outcome (model or error) is cached.
    if _whisper_state.model is not None or _whisper_state.error is not None:
        return _whisper_state.model, _whisper_state.error
    with _whisper_lock:
        # Another caller may have finished (or started) the attempt while we
        # were waiting for the lock.
        if _whisper_state.attempted:
            return _whisper_state.model, _whisper_state.error
        _whisper_state.attempted = True
        try:
            from ..sources.dependencies import require_module

            whisper_module = require_module(
                "faster_whisper",
                "audio/video transcription",
                ["transcription"],
                detail="Transcription requires the faster-whisper optional dependency.",
            )
            cfg = get_whisper_config()
            _whisper_state.model = whisper_module.WhisperModel(
                cfg.model,
                device=cfg.device,
                compute_type=cfg.compute_type,
            )
            logger.info(
                "Loaded faster-whisper model %s (device=%s, compute_type=%s)",
                cfg.model,
                cfg.device,
                cfg.compute_type,
            )
        except Exception as exc:
            from ..sources.dependencies import MissingSourceDependencyError

            if (
                isinstance(exc, MissingSourceDependencyError)
                and _whisper_state.install_retry_remaining > 0
            ):
                # Spend the retry budget and re-open the gate so the next
                # caller tries the install again.
                _whisper_state.install_retry_remaining -= 1
                _whisper_state.attempted = False
                logger.warning(
                    "Transcription dependency install failed (may be transient); "
                    "will retry once: %s",
                    exc,
                )
            else:
                # Callers test ``if error:``; an exception with an empty
                # message would be cached as "" and read as "no error", so
                # fall back to the exception class name.
                _whisper_state.error = str(exc) or type(exc).__name__
        return _whisper_state.model, _whisper_state.error
120
+
121
+
122
def _reset_whisper_singleton() -> None:
    """Put the shared Whisper state back to its initial values.

    Intended for test isolation only.
    """
    with _whisper_lock:
        state = _whisper_state
        state.model, state.error = None, None
        state.attempted, state.install_retry_remaining = False, 1
129
+
130
+
131
def _temp_suffix(file_name: str, mime_type: str) -> str:
    """Choose the extension for the temp file handed to the transcriber.

    Args:
        file_name: Original file name, path or URL; may be empty.
        mime_type: Media MIME type, optionally with ``;``-separated parameters.

    Returns:
        The lower-cased suffix of ``file_name`` when it has one; otherwise the
        hint registered for the MIME type, or ``".bin"`` when there is none.
    """
    if file_name:
        try:
            # For URLs, look only at the path so query strings are ignored.
            path = urlsplit(file_name).path or file_name
        except ValueError:
            # urlsplit rejects some malformed inputs (for example an
            # unbalanced "[" in the host part). The name is only a hint, so
            # treat it as a plain path instead of failing the transcription.
            path = file_name
        suffix = Path(path).suffix.lower()
        if suffix:
            return suffix
    normalized = mime_type.split(";", 1)[0].strip().lower()
    return _MIME_EXTENSION_HINTS.get(normalized, ".bin")
139
+
140
+
141
def _pcm_to_wav(pcm_bytes: bytes, sample_rate: int) -> bytes:
    """Return ``pcm_bytes`` (raw int16 mono PCM) wrapped in a WAV container."""
    sink = io.BytesIO()
    writer = wave.open(sink, "wb")
    try:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname):
        # mono, 2 bytes per sample, uncompressed; the frame count is filled
        # in by the writer when it is closed.
        writer.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm_bytes)
    finally:
        writer.close()
    return sink.getvalue()
150
+
151
+
152
_AUDIO_CHUNK_SECONDS = 600  # 10-minute chunks → ~38 MB decoded audio per chunk
_TARGET_SAMPLE_RATE = 16_000


def _split_audio_chunks(
    file_bytes: bytes,
    chunk_seconds: int = _AUDIO_CHUNK_SECONDS,
) -> Generator[bytes, None, None]:
    """Decode audio bytes and yield WAV chunks via PyAV (bundled with faster-whisper).

    The first audio stream is decoded frame by frame, resampled to 16 kHz mono
    int16, and emitted as WAV blobs of ``chunk_seconds`` each (the last one may
    be shorter), so only about one chunk of decoded PCM is buffered at a time.

    Fallbacks:
        * PyAV not importable, or no audio stream found: the original bytes
          are yielded unchanged as the only item.
        * Decoding fails before any chunk was emitted: the original bytes are
          yielded so the caller can attempt a full-file transcription.
        * Decoding fails after chunks were emitted: the audio decoded so far
          is flushed and the generator stops. Yielding the whole file here
          would transcribe the already-emitted audio a second time.

    Raises:
        ValueError: If ``chunk_seconds`` is not positive (a zero-sized chunk
            would otherwise make the chunking loop run forever).
    """
    if chunk_seconds <= 0:
        raise ValueError(f"chunk_seconds must be positive, got {chunk_seconds}")

    try:
        import av as pyav  # type: ignore[import-untyped]
    except ImportError:
        yield file_bytes
        return

    bytes_per_sample = 2  # int16
    bytes_per_chunk = _TARGET_SAMPLE_RATE * chunk_seconds * bytes_per_sample
    pending = bytearray()
    emitted = False

    def _buffer(frames: object) -> None:
        """Append resampler output (a list, a single frame, or None) to ``pending``."""
        if frames is None:
            return
        for out_frame in frames if isinstance(frames, list) else [frames]:
            # The plane buffer can be larger than the sample payload
            # (alignment padding), so keep only the bytes that carry samples.
            payload = out_frame.samples * bytes_per_sample
            pending.extend(bytes(out_frame.planes[0])[:payload])

    def _full_chunks() -> Generator[bytes, None, None]:
        """Yield every complete chunk currently buffered in ``pending``."""
        while len(pending) >= bytes_per_chunk:
            yield _pcm_to_wav(bytes(pending[:bytes_per_chunk]), _TARGET_SAMPLE_RATE)
            del pending[:bytes_per_chunk]

    try:
        # The context manager closes the container on every exit path.
        with pyav.open(io.BytesIO(file_bytes), metadata_errors="ignore") as container:
            audio_streams = [s for s in container.streams if s.type == "audio"]
            if not audio_streams:
                yield file_bytes
                return

            resampler = pyav.audio.resampler.AudioResampler(
                format="s16", layout="mono", rate=_TARGET_SAMPLE_RATE
            )

            for frame in container.decode(audio_streams[0]):
                _buffer(resampler.resample(frame))
                for wav_chunk in _full_chunks():
                    emitted = True
                    yield wav_chunk

            # Flush the resampler's internal buffer; a failure here only
            # loses the resampler's tail, so it is logged and ignored.
            try:
                _buffer(resampler.resample(None))
            except Exception as exc:
                logger.debug("Resampler flush failed (%s): %s", type(exc).__name__, exc)
            for wav_chunk in _full_chunks():
                emitted = True
                yield wav_chunk

            if pending:
                emitted = True
                yield _pcm_to_wav(bytes(pending), _TARGET_SAMPLE_RATE)

    except Exception as exc:
        if emitted:
            logger.warning(
                "Audio chunking failed after earlier chunks were emitted (%s); "
                "skipping the remaining audio: %s",
                type(exc).__name__,
                exc,
            )
            if pending:
                yield _pcm_to_wav(bytes(pending), _TARGET_SAMPLE_RATE)
            return
        logger.warning(
            "Audio chunking failed (%s); falling back to full-file transcription: %s",
            type(exc).__name__,
            exc,
        )
        yield file_bytes
216
+
217
+
218
+ def iter_transcription_pages(
219
+ file_bytes: bytes,
220
+ *,
221
+ mime_type: str,
222
+ file_name: str = "",
223
+ segments_per_page: int = 50,
224
+ chunk_seconds: int = _AUDIO_CHUNK_SECONDS,
225
+ ) -> Generator[str, None, None]:
226
+ """Transcribe audio/video in chunks, yielding pages of transcript text.
227
+
228
+ Splits long audio into ``chunk_seconds``-long WAV chunks and transcribes
229
+ each under the inference semaphore, then yields batches of
230
+ ``segments_per_page`` whisper segments as each chunk completes. This lets
231
+ the detector start receiving text immediately and keeps peak decoded-audio
232
+ memory bounded to one chunk at a time.
233
+ """
234
+ if not file_bytes:
235
+ return
236
+
237
+ model, error = _get_whisper_model()
238
+ if error:
239
+ logger.warning("Whisper model unavailable for %s: %s", file_name or mime_type, error)
240
+ return
241
+ if model is None:
242
+ logger.warning("Whisper model not initialized for %s", file_name or mime_type)
243
+ return
244
+
245
+ cfg = get_whisper_config()
246
+ suffix = _temp_suffix(file_name, mime_type)
247
+
248
+ for chunk_index, chunk_bytes in enumerate(_split_audio_chunks(file_bytes, chunk_seconds), 1):
249
+ is_wav = chunk_bytes[:4] == b"RIFF"
250
+ chunk_suffix = ".wav" if is_wav else suffix
251
+ try:
252
+ with tempfile.TemporaryDirectory(prefix="classifyre-whisper-") as temp_dir:
253
+ temp_path = Path(temp_dir) / f"chunk{chunk_suffix}"
254
+ temp_path.write_bytes(chunk_bytes)
255
+ with _whisper_inference_sem:
256
+ segments, _info = model.transcribe( # type: ignore[attr-defined]
257
+ str(temp_path),
258
+ beam_size=cfg.beam_size,
259
+ vad_filter=cfg.vad_filter,
260
+ word_timestamps=cfg.word_timestamps,
261
+ )
262
+ page: list[str] = []
263
+ total_chars = 0
264
+ for segment in segments:
265
+ text = segment.text.strip()
266
+ if text:
267
+ page.append(text)
268
+ total_chars += len(text)
269
+ if len(page) >= segments_per_page:
270
+ yield "\n".join(page)
271
+ page = []
272
+ if page:
273
+ yield "\n".join(page)
274
+ logger.info(
275
+ "Transcribed chunk %d: %d chars from %s (%s)",
276
+ chunk_index,
277
+ total_chars,
278
+ file_name or mime_type,
279
+ mime_type,
280
+ )
281
+ except Exception as exc:
282
+ logger.warning(
283
+ "Transcription failed for chunk %d of %s: %s",
284
+ chunk_index,
285
+ file_name or mime_type,
286
+ exc,
287
+ )
288
+ raise
289
+
290
+
291
def transcribe_media(
    file_bytes: bytes,
    *,
    mime_type: str,
    file_name: str = "",
) -> tuple[str, str | None]:
    """Transcribe audio/video bytes and return the whole transcript at once.

    The full transcript is buffered before returning; for long files prefer
    ``iter_transcription_pages``.

    Returns:
        ``(text, error)``. ``error`` is ``None`` on success (including empty
        input, which gives ``("", None)``); on failure ``text`` is ``""`` and
        ``error`` describes the problem.
    """
    if not file_bytes:
        return "", None

    model, model_error = _get_whisper_model()
    if model_error:
        return "", model_error
    if model is None:
        return "", "Whisper model not initialized"

    try:
        # join() drains the page generator inside the try, so transcription
        # errors are reported through the returned error string.
        transcript = "\n".join(
            iter_transcription_pages(file_bytes, mime_type=mime_type, file_name=file_name)
        )
    except Exception as exc:
        logger.warning("Transcription failed for %s: %s", file_name or mime_type, exc)
        return "", f"Transcription failed: {exc}"

    if not transcript:
        return transcript, None

    logger.info(
        "Transcribed %d chars from %s (%s)",
        len(transcript),
        file_name or mime_type,
        mime_type,
    )
    return transcript, None