classifyre-cli 0.4.22__tar.gz → 0.4.24__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (207)
  1. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/package.json +1 -1
  4. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/pyproject.toml +1 -1
  5. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/pipeline/detector_pipeline.py +45 -3
  6. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/pipeline/parsed_content_provider.py +22 -0
  7. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/pipeline/worker_pool.py +3 -57
  8. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/object_storage/base.py +72 -14
  9. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/file_parser.py +12 -0
  10. classifyre_cli-0.4.24/src/utils/resources.py +65 -0
  11. classifyre_cli-0.4.24/src/utils/transcription.py +383 -0
  12. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/uv.lock +113 -122
  13. classifyre_cli-0.4.22/src/utils/transcription.py +0 -177
  14. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/.gitignore +0 -0
  15. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/.python-version +0 -0
  16. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/README.md +0 -0
  17. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/main.py +0 -0
  18. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/scripts/generate_models.py +0 -0
  19. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/__init__.py +0 -0
  20. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/config.py +0 -0
  21. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/__init__.py +0 -0
  22. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/base.py +0 -0
  23. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/broken_links/__init__.py +0 -0
  24. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/broken_links/detector.py +0 -0
  25. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/config.py +0 -0
  26. {classifyre_cli-0.4.22/tests/detectors/threat → classifyre_cli-0.4.24/src/detectors/content}/__init__.py +0 -0
  27. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/__init__.py +0 -0
  28. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/detector.py +0 -0
  29. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/extractor.py +0 -0
  30. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/__init__.py +0 -0
  31. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_base.py +0 -0
  32. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_factory.py +0 -0
  33. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  34. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_gliner2.py +0 -0
  35. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_image_classification.py +0 -0
  36. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_llm.py +0 -0
  37. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_object_detection.py +0 -0
  38. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_regex.py +0 -0
  39. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/runners/_text_classification.py +0 -0
  40. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/custom/trainer.py +0 -0
  41. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/dependencies.py +0 -0
  42. {classifyre_cli-0.4.22/tests/detectors/secrets → classifyre_cli-0.4.24/src/detectors/pii}/__init__.py +0 -0
  43. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/pii/detector.py +0 -0
  44. {classifyre_cli-0.4.22/tests/detectors/pii → classifyre_cli-0.4.24/src/detectors/secrets}/__init__.py +0 -0
  45. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/secrets/detector.py +0 -0
  46. {classifyre_cli-0.4.22/tests/detectors/custom → classifyre_cli-0.4.24/src/detectors/threat}/__init__.py +0 -0
  47. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/threat/code_security_detector.py +0 -0
  48. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/detectors/threat/yara_detector.py +0 -0
  49. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/main.py +0 -0
  50. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/models/generated_detectors.py +0 -0
  51. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/models/generated_input.py +0 -0
  52. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/models/generated_single_asset_scan_results.py +0 -0
  53. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/outputs/__init__.py +0 -0
  54. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/outputs/base.py +0 -0
  55. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/outputs/console.py +0 -0
  56. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/outputs/factory.py +0 -0
  57. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/outputs/file.py +0 -0
  58. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/outputs/rest.py +0 -0
  59. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/pipeline/__init__.py +0 -0
  60. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/pipeline/content_provider.py +0 -0
  61. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sandbox/__init__.py +0 -0
  62. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sandbox/runner.py +0 -0
  63. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/__init__.py +0 -0
  64. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/asset_metadata.py +0 -0
  65. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/atlassian_common.py +0 -0
  66. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/azure_blob_storage/__init__.py +0 -0
  67. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/azure_blob_storage/source.py +0 -0
  68. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/base.py +0 -0
  69. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/confluence/__init__.py +0 -0
  70. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/confluence/source.py +0 -0
  71. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/databricks/__init__.py +0 -0
  72. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/databricks/source.py +0 -0
  73. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/dependencies.py +0 -0
  74. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/email/__init__.py +0 -0
  75. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/email/source.py +0 -0
  76. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/google_cloud_storage/__init__.py +0 -0
  77. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/google_cloud_storage/source.py +0 -0
  78. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/hive/__init__.py +0 -0
  79. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/hive/source.py +0 -0
  80. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/jira/__init__.py +0 -0
  81. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/jira/source.py +0 -0
  82. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/mongodb/__init__.py +0 -0
  83. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/mongodb/source.py +0 -0
  84. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/mssql/__init__.py +0 -0
  85. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/mssql/source.py +0 -0
  86. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/mysql/__init__.py +0 -0
  87. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/mysql/source.py +0 -0
  88. {classifyre_cli-0.4.22/tests/detectors/content → classifyre_cli-0.4.24/src/sources/neo4j}/__init__.py +0 -0
  89. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/neo4j/source.py +0 -0
  90. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/notion/__init__.py +0 -0
  91. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/notion/client.py +0 -0
  92. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/notion/source.py +0 -0
  93. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/oracle/__init__.py +0 -0
  94. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/oracle/source.py +0 -0
  95. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/postgresql/__init__.py +0 -0
  96. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/postgresql/source.py +0 -0
  97. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/powerbi/__init__.py +0 -0
  98. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/powerbi/source.py +0 -0
  99. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/recipe_normalizer.py +0 -0
  100. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/s3_compatible_storage/README.md +0 -0
  101. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/s3_compatible_storage/__init__.py +0 -0
  102. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/s3_compatible_storage/source.py +0 -0
  103. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/servicedesk/__init__.py +0 -0
  104. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/servicedesk/source.py +0 -0
  105. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/slack/__init__.py +0 -0
  106. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/slack/source.py +0 -0
  107. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/snowflake/__init__.py +0 -0
  108. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/snowflake/source.py +0 -0
  109. {classifyre_cli-0.4.22/tests/detectors → classifyre_cli-0.4.24/src/sources/sqlite}/__init__.py +0 -0
  110. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/sqlite/source.py +0 -0
  111. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/tableau/__init__.py +0 -0
  112. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/tableau/source.py +0 -0
  113. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/tabular_base.py +0 -0
  114. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/tabular_utils.py +0 -0
  115. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/wordpress/__init__.py +0 -0
  116. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/wordpress/source.py +0 -0
  117. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/youtube/__init__.py +0 -0
  118. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/youtube/source.py +0 -0
  119. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/telemetry.py +0 -0
  120. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/__init__.py +0 -0
  121. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/content_extraction.py +0 -0
  122. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/dependency_groups.py +0 -0
  123. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/embedded_images.py +0 -0
  124. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/file_metadata.py +0 -0
  125. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/file_to_images.py +0 -0
  126. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/hashing.py +0 -0
  127. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/uv_sync.py +0 -0
  128. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/validation.py +0 -0
  129. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/__init__.py +0 -0
  130. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/conftest.py +0 -0
  131. {classifyre_cli-0.4.22/src/sources/sqlite → classifyre_cli-0.4.24/tests/detectors}/__init__.py +0 -0
  132. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  133. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/conftest.py +0 -0
  134. {classifyre_cli-0.4.22/src/sources/neo4j → classifyre_cli-0.4.24/tests/detectors/content}/__init__.py +0 -0
  135. {classifyre_cli-0.4.22/src/detectors/threat → classifyre_cli-0.4.24/tests/detectors/custom}/__init__.py +0 -0
  136. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/custom/conftest.py +0 -0
  137. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  138. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/custom/test_llm_runner.py +0 -0
  139. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  140. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/custom/test_regex_runner.py +0 -0
  141. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/custom/test_transformer_runners.py +0 -0
  142. {classifyre_cli-0.4.22/src/detectors/secrets → classifyre_cli-0.4.24/tests/detectors/pii}/__init__.py +0 -0
  143. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/pii/conftest.py +0 -0
  144. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/pii/sample_invoice.pdf +0 -0
  145. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/pii/test_pii_detector.py +0 -0
  146. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  147. {classifyre_cli-0.4.22/src/detectors/pii → classifyre_cli-0.4.24/tests/detectors/secrets}/__init__.py +0 -0
  148. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  149. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  150. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/test_base_detector.py +0 -0
  151. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  152. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  153. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/test_detector_pipeline_types.py +0 -0
  154. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/test_detector_schema_examples.py +0 -0
  155. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/test_detector_types.py +0 -0
  156. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/test_phase2_detectors.py +0 -0
  157. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/test_registry.py +0 -0
  158. {classifyre_cli-0.4.22/src/detectors/content → classifyre_cli-0.4.24/tests/detectors/threat}/__init__.py +0 -0
  159. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/threat/test_code_security_detector.py +0 -0
  160. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/detectors/threat/test_yara_detector.py +0 -0
  161. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  162. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/integration/test_wordpress_links_assets.py +0 -0
  163. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/pipeline/test_detector_pipeline.py +0 -0
  164. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/pipeline/test_worker_pool.py +0 -0
  165. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_assets_metadata_catalog.py +0 -0
  166. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_azure_blob_storage_source.py +0 -0
  167. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_base_source_attachment.py +0 -0
  168. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_base_source_sampling.py +0 -0
  169. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_config.py +0 -0
  170. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_confluence_source.py +0 -0
  171. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_custom_extractor.py +0 -0
  172. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_databricks_source.py +0 -0
  173. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_dependency_groups.py +0 -0
  174. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_email_source.py +0 -0
  175. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_google_cloud_storage_source.py +0 -0
  176. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_hashing.py +0 -0
  177. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_hive_source.py +0 -0
  178. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_jira_source.py +0 -0
  179. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_mongodb_source.py +0 -0
  180. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_mssql_source.py +0 -0
  181. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_mysql_source.py +0 -0
  182. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_neo4j_source.py +0 -0
  183. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_notion_source.py +0 -0
  184. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_oracle_source.py +0 -0
  185. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_outputs.py +0 -0
  186. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_postgresql_source.py +0 -0
  187. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_powerbi_source.py +0 -0
  188. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_recipe_normalizer.py +0 -0
  189. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_s3_compatible_storage_source.py +0 -0
  190. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_sandbox_runner.py +0 -0
  191. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_servicedesk_source.py +0 -0
  192. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_slack_source.py +0 -0
  193. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_snowflake_source.py +0 -0
  194. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_source_dependency_groups.py +0 -0
  195. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_sqlite_source.py +0 -0
  196. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_tableau_source.py +0 -0
  197. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_tabular_utils.py +0 -0
  198. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_uv_sync.py +0 -0
  199. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_wordpress_source.py +0 -0
  200. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_youtube_source.py +0 -0
  201. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/test_youtube_source_integration.py +0 -0
  202. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/utils/test_content_extraction.py +0 -0
  203. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/utils/test_embedded_images.py +0 -0
  204. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/utils/test_file_metadata.py +0 -0
  205. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/utils/test_file_parser.py +0 -0
  206. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/utils/test_file_to_images.py +0 -0
  207. {classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/tests/utils/test_transcription.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 262 packages in 141ms
2
+ Resolved 262 packages in 149ms
3
3
  Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.22
3
+ Version: 0.4.24
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.22",
3
+ "version": "0.4.24",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.22"
3
+ version = "0.4.24"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -109,7 +109,10 @@ class DetectorPipeline:
109
109
 
110
110
  scan_started = datetime.now(UTC)
111
111
  ocr_enabled = self.source.ocr_enabled()
112
- text_content_type = self._text_content_type_for_asset(asset.asset_type, ocr_enabled)
112
+ transcription_enabled = self.source.transcription_enabled()
113
+ text_content_type = self._text_content_type_for_asset(
114
+ asset.asset_type, ocr_enabled, transcription_enabled
115
+ )
113
116
  link_content = self._build_links_payload(asset.links)
114
117
 
115
118
  text_detectors = []
@@ -294,13 +297,23 @@ class DetectorPipeline:
294
297
  page_num=page_num,
295
298
  )
296
299
  elapsed = int((time.monotonic() - t0) * 1000)
300
+ snippet = page_content[:120].replace("\n", "\\n") if page_content else ""
297
301
  logger.info(
298
- " %s page %d done: %d findings (%dms)",
302
+ " %s page %d: %d findings in %dms — snippet: %s",
299
303
  asset.name,
300
304
  page_num,
301
305
  len(page_findings),
302
306
  elapsed,
307
+ snippet,
303
308
  )
309
+ if page_findings:
310
+ for f in page_findings[:5]:
311
+ logger.info(
312
+ " finding: type=%s detector=%s matched=%.100s",
313
+ f.finding_type,
314
+ f.detector_type,
315
+ f.matched_content[:100].replace("\n", " "),
316
+ )
304
317
  return page_findings, page_types, page_errors, page_content, page_num
305
318
 
306
319
  def _collect_done() -> None:
@@ -401,13 +414,23 @@ class DetectorPipeline:
401
414
  page_num=page_num,
402
415
  )
403
416
  elapsed = int((time.monotonic() - t0) * 1000)
417
+ snippet = page_content[:120].replace("\n", "\\n") if page_content else ""
404
418
  logger.info(
405
- " %s page %d done: %d findings (%dms)",
419
+ " %s page %d: %d findings in %dms — snippet: %s",
406
420
  asset.name,
407
421
  page_num,
408
422
  len(page_findings),
409
423
  elapsed,
424
+ snippet,
410
425
  )
426
+ if page_findings:
427
+ for f in page_findings[:5]:
428
+ logger.info(
429
+ " finding: type=%s detector=%s matched=%.100s",
430
+ f.finding_type,
431
+ f.detector_type,
432
+ f.matched_content[:100].replace("\n", " "),
433
+ )
411
434
  return page_findings, page_types, page_errors, page_content, page_num
412
435
 
413
436
  async def _collect_done_and_flush(min_findings: int = 1) -> None:
@@ -488,6 +511,12 @@ class DetectorPipeline:
488
511
  continue
489
512
  candidate_ids.append(value)
490
513
 
514
+ logger.info(
515
+ "_iter_text_content_pages(%s): trying candidates %s",
516
+ asset.name,
517
+ candidate_ids,
518
+ )
519
+
491
520
  for candidate_id in candidate_ids:
492
521
  saw_candidate_content = False
493
522
  async for text_content in self.content_provider.fetch_text_pages(candidate_id):
@@ -499,6 +528,16 @@ class DetectorPipeline:
499
528
  if saw_candidate_content:
500
529
  return
501
530
 
531
+ # If fetch_content_pages ran the full bytes-path extraction (even
532
+ # yielding 0 text, e.g. silent audio), the source already did the
533
+ # expensive work. Don't re-process with another candidate ID for
534
+ # the same asset.
535
+ source = getattr(self.content_provider, "_source", None)
536
+ if source is not None:
537
+ processed: set[str] = getattr(source, "_content_pages_processed", set())
538
+ if candidate_id in processed:
539
+ return
540
+
502
541
  async def _run_binary_detectors_for_asset(
503
542
  self,
504
543
  *,
@@ -727,6 +766,7 @@ class DetectorPipeline:
727
766
  self,
728
767
  asset_type: OutputAssetType,
729
768
  ocr_enabled: bool,
769
+ transcription_enabled: bool = False,
730
770
  ) -> str | None:
731
771
  mapping = {
732
772
  OutputAssetType.TXT: "text/plain",
@@ -737,6 +777,8 @@ class DetectorPipeline:
737
777
  return mapping[asset_type]
738
778
  if ocr_enabled and asset_type in {OutputAssetType.IMAGE, OutputAssetType.BINARY}:
739
779
  return "text/plain"
780
+ if transcription_enabled and asset_type in {OutputAssetType.AUDIO, OutputAssetType.VIDEO}:
781
+ return "text/plain"
740
782
  return None
741
783
 
742
784
  @staticmethod
@@ -3,11 +3,14 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ import logging
6
7
  from collections.abc import AsyncGenerator
7
8
 
8
9
  from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
9
10
  from ..sources.base import BaseSource
10
11
 
12
+ logger = logging.getLogger(__name__)
13
+
11
14
 
12
15
  class ParsedContentProvider:
13
16
  """
@@ -32,11 +35,30 @@ class ParsedContentProvider:
32
35
  if saw_text:
33
36
  return
34
37
 
38
+ # If fetch_content_pages already ran the full extraction pipeline for
39
+ # this asset (tracked via _content_pages_processed), skip the fallback
40
+ # iter_asset_pages call. Without this, an all-silence audio file would
41
+ # trigger a redundant second transcription pass.
42
+ pages_processed: set[str] | None = getattr(self._source, "_content_pages_processed", None)
43
+ if isinstance(pages_processed, set) and asset_id in pages_processed:
44
+ logger.info(
45
+ "fetch_text_pages(%s): source already processed, skipping fallback",
46
+ asset_id,
47
+ )
48
+ return
49
+
35
50
  result = await self._source.fetch_content_bytes(asset_id)
36
51
  if result is None:
52
+ logger.info("fetch_text_pages(%s): fetch_content_bytes returned None", asset_id)
37
53
  return
38
54
 
39
55
  raw_bytes, mime = result
56
+ logger.info(
57
+ "fetch_text_pages(%s): fallback iter_asset_pages path (%s, %d bytes)",
58
+ asset_id,
59
+ mime,
60
+ len(raw_bytes),
61
+ )
40
62
  pages: list[str] = await asyncio.to_thread(
41
63
  list,
42
64
  self._source.iter_asset_pages(raw_bytes, mime),
@@ -21,6 +21,9 @@ from concurrent.futures import ProcessPoolExecutor
21
21
  from typing import Any
22
22
 
23
23
  from ..models.generated_single_asset_scan_results import DetectionResult
24
+ from ..utils.resources import get_effective_cpu_count, get_effective_memory_mb
25
+
26
+ __all__ = ["get_effective_cpu_count", "get_effective_memory_mb"]
24
27
 
25
28
  logger = logging.getLogger(__name__)
26
29
 
@@ -130,63 +133,6 @@ def is_io_bound_detector(detector_name: str) -> bool:
130
133
  return detector_name in _IO_BOUND_DETECTORS
131
134
 
132
135
 
133
- def get_effective_cpu_count() -> int:
134
- """Return the number of usable CPUs, respecting cgroup limits (K8s/Docker).
135
-
136
- ``os.cpu_count()`` returns the *host* CPU count, which can be much larger
137
- than what the container is allowed to use. This function reads the cgroup
138
- v2 ``cpu.max`` (or v1 ``cpu.cfs_quota_us``/``cpu.cfs_period_us``) to
139
- determine the actual allocation.
140
- """
141
- try:
142
- data = open("/sys/fs/cgroup/cpu.max").read().strip()
143
- quota_str, period_str = data.split()
144
- if quota_str != "max":
145
- cpus = int(quota_str) / int(period_str)
146
- if cpus >= 0.5:
147
- return max(1, int(cpus))
148
- except (FileNotFoundError, OSError, ValueError):
149
- pass
150
-
151
- try:
152
- quota = int(open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us").read().strip())
153
- period = int(open("/sys/fs/cgroup/cpu/cpu.cfs_period_us").read().strip())
154
- if quota > 0 and period > 0:
155
- cpus = quota / period
156
- if cpus >= 0.5:
157
- return max(1, int(cpus))
158
- except (FileNotFoundError, OSError, ValueError):
159
- pass
160
-
161
- return os.cpu_count() or 4
162
-
163
-
164
- def get_effective_memory_mb() -> int:
165
- """Return usable memory in MB, respecting cgroup limits."""
166
- try:
167
- mem_bytes = int(open("/sys/fs/cgroup/memory.max").read().strip())
168
- if mem_bytes < 2**50:
169
- return max(256, mem_bytes // (1024 * 1024))
170
- except (FileNotFoundError, OSError, ValueError):
171
- pass
172
-
173
- try:
174
- mem_bytes = int(open("/sys/fs/cgroup/memory/memory.limit_in_bytes").read().strip())
175
- if mem_bytes < 2**50:
176
- return max(256, mem_bytes // (1024 * 1024))
177
- except (FileNotFoundError, OSError, ValueError):
178
- pass
179
-
180
- try:
181
- for line in open("/proc/meminfo"):
182
- if line.startswith("MemTotal:"):
183
- return max(256, int(line.split()[1]) // 1024)
184
- except (FileNotFoundError, OSError, ValueError):
185
- pass
186
-
187
- return 4096
188
-
189
-
190
136
  def compute_pool_workers(override: int | None = None) -> int:
191
137
  """Compute optimal pool size from actual resource limits.
192
138
 
@@ -135,6 +135,11 @@ class ObjectStorageSourceBase(BaseSource, ABC):
135
135
  # Keyed by both asset_hash and external_url for O(1) lookup from either.
136
136
  self._bytes_cache: dict[str, bytes] = {}
137
137
  self._mime_cache: dict[str, str] = {}
138
+ # asset_ids for which fetch_content_pages ran the full bytes path
139
+ # (even if it produced no text, e.g. all-silence audio). Checked by
140
+ # ParsedContentProvider to skip its fallback iter_asset_pages path,
141
+ # which would otherwise re-run an expensive transcription a second time.
142
+ self._content_pages_processed: set[str] = set()
138
143
  # Child IMAGE assets queued while transforming the current object.
139
144
  self._pending_child_assets: list[SingleAssetScanResults] = []
140
145
 
@@ -302,6 +307,15 @@ class ObjectStorageSourceBase(BaseSource, ABC):
302
307
 
303
308
  return OutputAssetType.OTHER
304
309
 
310
+ @staticmethod
311
+ def _asset_kind_for_asset_type(asset_type: OutputAssetType) -> str:
312
+ mapping: dict[OutputAssetType, str] = {
313
+ OutputAssetType.IMAGE: "image",
314
+ OutputAssetType.AUDIO: "audio",
315
+ OutputAssetType.VIDEO: "video",
316
+ }
317
+ return mapping.get(asset_type, "file")
318
+
305
319
  def _ensure_file_processing_dependencies(self) -> None:
306
320
  if self._file_processing_deps_checked:
307
321
  return
@@ -446,7 +460,7 @@ class ObjectStorageSourceBase(BaseSource, ABC):
446
460
  created_at=ref.last_modified,
447
461
  updated_at=ref.last_modified,
448
462
  runner_id=self.runner_id,
449
- **self.metadata_fields("file", asset_metadata),
463
+ **self.metadata_fields(self._asset_kind_for_asset_type(asset_type), asset_metadata),
450
464
  )
451
465
  self._hash_to_uri[asset_hash] = external_url
452
466
  self._object_ref_by_hash[asset_hash] = ref
@@ -549,6 +563,7 @@ class ObjectStorageSourceBase(BaseSource, ABC):
549
563
  self._object_ref_by_hash = {}
550
564
  self._bytes_cache = {}
551
565
  self._mime_cache = {}
566
+ self._content_pages_processed = set()
552
567
  self._pending_child_assets = []
553
568
 
554
569
  refs = self._list_objects()
@@ -628,26 +643,69 @@ class ObjectStorageSourceBase(BaseSource, ABC):
628
643
  raw_bytes = self._bytes_cache.get(asset_id)
629
644
  mime = self._mime_cache.get(asset_id, "")
630
645
 
646
+ logger.info(
647
+ "fetch_content_pages(%s): raw_bytes=%s mime=%s processed=%s",
648
+ asset_id,
649
+ f"{len(raw_bytes)} bytes" if raw_bytes is not None else "MISS",
650
+ mime or "MISS",
651
+ asset_id in self._content_pages_processed,
652
+ )
653
+
631
654
  if raw_bytes is not None:
632
655
  sampling = self.config.sampling
633
656
  batch_size = int(sampling.rows_per_page or 100)
634
657
  include_col_names = bool(
635
658
  sampling.include_column_names if sampling.include_column_names is not None else True
636
659
  )
637
- # Run the (potentially blocking) file parsing in a thread so pyarrow /
638
- # pdfplumber can't freeze the event loop during large file iteration.
639
- pages: list[str] = await asyncio.to_thread(
640
- list,
641
- self.iter_asset_pages(
642
- raw_bytes,
643
- mime,
644
- batch_size,
645
- include_col_names,
646
- file_name=self._file_name_for_asset_id(asset_id),
647
- ),
660
+ file_name = self._file_name_for_asset_id(asset_id)
661
+
662
+ # Stream pages from a thread instead of materializing via list().
663
+ # For transcription this lets detectors start working on the first
664
+ # chunk while later chunks are still being transcribed.
665
+ loop = asyncio.get_running_loop()
666
+ queue: asyncio.Queue[str | None] = asyncio.Queue()
667
+
668
+ exc_info: list[BaseException | None] = [None]
669
+
670
+ page_count: int = 0
671
+
672
+ def _produce() -> None:
673
+ nonlocal page_count
674
+ try:
675
+ for page in self.iter_asset_pages(
676
+ raw_bytes,
677
+ mime,
678
+ batch_size,
679
+ include_col_names,
680
+ file_name=file_name,
681
+ ):
682
+ loop.call_soon_threadsafe(queue.put_nowait, page)
683
+ page_count += 1
684
+ except BaseException as exc:
685
+ exc_info[0] = exc
686
+ finally:
687
+ loop.call_soon_threadsafe(queue.put_nowait, None)
688
+
689
+ task = loop.run_in_executor(None, _produce)
690
+
691
+ while True:
692
+ page = await queue.get()
693
+ if page is None:
694
+ break
695
+ yield "", page
696
+
697
+ await task
698
+ if exc_info[0] is not None:
699
+ raise exc_info[0] # type: ignore[misc]
700
+
701
+ logger.info(
702
+ "fetch_content_pages(%s): streamed %d page(s) from %s",
703
+ asset_id,
704
+ page_count,
705
+ file_name,
648
706
  )
649
- for batch_text in pages:
650
- yield "", batch_text
707
+
708
+ self._content_pages_processed.add(asset_id)
651
709
  return
652
710
 
653
711
  result = await self.fetch_content(asset_id)
@@ -690,6 +690,18 @@ def iter_file_pages(
690
690
  yield from _iter_parquet_pages(file_bytes, batch_size, include_column_names)
691
691
  elif normalized in ("text/csv", "text/tab-separated-values"):
692
692
  yield from _iter_csv_pages(file_bytes, include_column_names)
693
+ elif normalized.startswith(("audio/", "video/")) and enable_transcription:
694
+ # Stream transcript pages directly from the chunked transcription pipeline
695
+ # so the detector receives text as each ~10-min audio chunk completes
696
+ # instead of waiting for the full file and buffering the entire transcript.
697
+ from .transcription import iter_transcription_pages
698
+
699
+ yield from iter_transcription_pages(
700
+ file_bytes,
701
+ mime_type=normalized,
702
+ file_name=file_name,
703
+ segments_per_page=batch_size,
704
+ )
693
705
  else:
694
706
  text, error = extract_text(
695
707
  file_bytes,
@@ -0,0 +1,65 @@
1
+ """Cgroup-aware CPU and memory introspection.
2
+
3
+ Shared by the detector worker pool (to size the process pool) and the
4
+ transcription pipeline (to select the right Whisper model at runtime).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+
11
+
12
def get_effective_cpu_count() -> int:
    """Return usable CPUs, respecting cgroup limits (K8s / Docker).

    ``os.cpu_count()`` returns the *host* count, which is usually much larger
    than the container's CPU quota. This reads cgroup v2, then cgroup v1, to
    get the actual allocation.

    Returns:
        The quota rounded down to whole CPUs (never less than 1) when a
        cgroup quota of at least half a CPU is found; otherwise
        ``os.cpu_count()``, or 4 when that is unavailable.
    """
    # cgroup v2: one file containing "<quota> <period>" or "max <period>".
    try:
        with open("/sys/fs/cgroup/cpu.max") as fh:
            quota_str, period_str = fh.read().split()
        if quota_str != "max":
            period = int(period_str)
            # Guard the division the same way the cgroup v1 branch does.
            if period > 0:
                cpus = int(quota_str) / period
                # Quotas below half a CPU are ignored; the next source is tried.
                if cpus >= 0.5:
                    return max(1, int(cpus))
    except (OSError, ValueError):
        # Missing/unreadable file or unexpected contents: try cgroup v1.
        pass

    # cgroup v1: quota and period live in separate files.
    try:
        with open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") as fh:
            quota = int(fh.read().strip())
        with open("/sys/fs/cgroup/cpu/cpu.cfs_period_us") as fh:
            period = int(fh.read().strip())
        if quota > 0 and period > 0:
            cpus = quota / period
            if cpus >= 0.5:
                return max(1, int(cpus))
    except (OSError, ValueError):
        # No usable cgroup v1 data either: fall back to the host count.
        pass

    return os.cpu_count() or 4
40
+
41
+
42
def get_effective_memory_mb() -> int:
    """Return usable memory in MB, respecting cgroup limits (K8s / Docker).

    Sources are tried in order: the cgroup v2 limit, the cgroup v1 limit,
    then ``MemTotal`` from ``/proc/meminfo``.

    Returns:
        The first usable limit converted to MB and clamped to at least 256,
        or 4096 when no source can be read.
    """
    cgroup_limit_files = (
        "/sys/fs/cgroup/memory.max",  # cgroup v2
        "/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroup v1
    )
    for path in cgroup_limit_files:
        try:
            with open(path) as fh:
                mem_bytes = int(fh.read().strip())
        except (OSError, ValueError):
            # Missing file or a non-numeric value: try the next source.
            continue
        # Values of 2**50 bytes or more are skipped rather than used as a limit.
        if mem_bytes < 2**50:
            return max(256, mem_bytes // (1024 * 1024))

    try:
        with open("/proc/meminfo") as fh:
            for line in fh:
                if line.startswith("MemTotal:"):
                    # The second field is divided by 1024 to obtain MB.
                    return max(256, int(line.split()[1]) // 1024)
    except (OSError, ValueError, IndexError):
        # Unreadable or malformed meminfo: use the fixed default below.
        pass

    return 4096