classifyre-cli 0.4.23__tar.gz → 0.4.25__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (206)
  1. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/package.json +1 -1
  4. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/pyproject.toml +1 -1
  5. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/models/generated_input.py +56 -36
  6. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/detector_pipeline.py +38 -2
  7. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/parsed_content_provider.py +22 -0
  8. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/worker_pool.py +3 -57
  9. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/object_storage/base.py +72 -14
  10. classifyre_cli-0.4.25/src/utils/resources.py +65 -0
  11. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/transcription.py +61 -3
  12. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/uv.lock +158 -172
  13. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/.gitignore +0 -0
  14. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/.python-version +0 -0
  15. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/README.md +0 -0
  16. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/main.py +0 -0
  17. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/scripts/generate_models.py +0 -0
  18. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/__init__.py +0 -0
  19. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/config.py +0 -0
  20. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/__init__.py +0 -0
  21. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/base.py +0 -0
  22. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/broken_links/__init__.py +0 -0
  23. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/broken_links/detector.py +0 -0
  24. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/config.py +0 -0
  25. {classifyre_cli-0.4.23/tests/detectors/threat → classifyre_cli-0.4.25/src/detectors/content}/__init__.py +0 -0
  26. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/__init__.py +0 -0
  27. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/detector.py +0 -0
  28. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/extractor.py +0 -0
  29. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/__init__.py +0 -0
  30. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_base.py +0 -0
  31. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_factory.py +0 -0
  32. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  33. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_gliner2.py +0 -0
  34. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_image_classification.py +0 -0
  35. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_llm.py +0 -0
  36. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_object_detection.py +0 -0
  37. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_regex.py +0 -0
  38. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_text_classification.py +0 -0
  39. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/trainer.py +0 -0
  40. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/dependencies.py +0 -0
  41. {classifyre_cli-0.4.23/tests/detectors/secrets → classifyre_cli-0.4.25/src/detectors/pii}/__init__.py +0 -0
  42. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/pii/detector.py +0 -0
  43. {classifyre_cli-0.4.23/tests/detectors/pii → classifyre_cli-0.4.25/src/detectors/secrets}/__init__.py +0 -0
  44. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/secrets/detector.py +0 -0
  45. {classifyre_cli-0.4.23/tests/detectors/custom → classifyre_cli-0.4.25/src/detectors/threat}/__init__.py +0 -0
  46. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/threat/code_security_detector.py +0 -0
  47. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/threat/yara_detector.py +0 -0
  48. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/main.py +0 -0
  49. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/models/generated_detectors.py +0 -0
  50. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/models/generated_single_asset_scan_results.py +0 -0
  51. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/__init__.py +0 -0
  52. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/base.py +0 -0
  53. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/console.py +0 -0
  54. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/factory.py +0 -0
  55. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/file.py +0 -0
  56. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/rest.py +0 -0
  57. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/__init__.py +0 -0
  58. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/content_provider.py +0 -0
  59. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sandbox/__init__.py +0 -0
  60. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sandbox/runner.py +0 -0
  61. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/__init__.py +0 -0
  62. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/asset_metadata.py +0 -0
  63. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/atlassian_common.py +0 -0
  64. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/azure_blob_storage/__init__.py +0 -0
  65. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/azure_blob_storage/source.py +0 -0
  66. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/base.py +0 -0
  67. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/confluence/__init__.py +0 -0
  68. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/confluence/source.py +0 -0
  69. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/databricks/__init__.py +0 -0
  70. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/databricks/source.py +0 -0
  71. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/dependencies.py +0 -0
  72. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/email/__init__.py +0 -0
  73. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/email/source.py +0 -0
  74. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/google_cloud_storage/__init__.py +0 -0
  75. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/google_cloud_storage/source.py +0 -0
  76. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/hive/__init__.py +0 -0
  77. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/hive/source.py +0 -0
  78. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/jira/__init__.py +0 -0
  79. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/jira/source.py +0 -0
  80. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mongodb/__init__.py +0 -0
  81. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mongodb/source.py +0 -0
  82. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mssql/__init__.py +0 -0
  83. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mssql/source.py +0 -0
  84. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mysql/__init__.py +0 -0
  85. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mysql/source.py +0 -0
  86. {classifyre_cli-0.4.23/tests/detectors/content → classifyre_cli-0.4.25/src/sources/neo4j}/__init__.py +0 -0
  87. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/neo4j/source.py +0 -0
  88. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/notion/__init__.py +0 -0
  89. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/notion/client.py +0 -0
  90. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/notion/source.py +0 -0
  91. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/oracle/__init__.py +0 -0
  92. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/oracle/source.py +0 -0
  93. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/postgresql/__init__.py +0 -0
  94. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/postgresql/source.py +0 -0
  95. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/powerbi/__init__.py +0 -0
  96. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/powerbi/source.py +0 -0
  97. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/recipe_normalizer.py +0 -0
  98. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/s3_compatible_storage/README.md +0 -0
  99. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/s3_compatible_storage/__init__.py +0 -0
  100. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/s3_compatible_storage/source.py +0 -0
  101. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/servicedesk/__init__.py +0 -0
  102. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/servicedesk/source.py +0 -0
  103. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/slack/__init__.py +0 -0
  104. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/slack/source.py +0 -0
  105. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/snowflake/__init__.py +0 -0
  106. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/snowflake/source.py +0 -0
  107. {classifyre_cli-0.4.23/tests/detectors → classifyre_cli-0.4.25/src/sources/sqlite}/__init__.py +0 -0
  108. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/sqlite/source.py +0 -0
  109. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/tableau/__init__.py +0 -0
  110. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/tableau/source.py +0 -0
  111. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/tabular_base.py +0 -0
  112. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/tabular_utils.py +0 -0
  113. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/wordpress/__init__.py +0 -0
  114. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/wordpress/source.py +0 -0
  115. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/youtube/__init__.py +0 -0
  116. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/youtube/source.py +0 -0
  117. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/telemetry.py +0 -0
  118. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/__init__.py +0 -0
  119. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/content_extraction.py +0 -0
  120. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/dependency_groups.py +0 -0
  121. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/embedded_images.py +0 -0
  122. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/file_metadata.py +0 -0
  123. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/file_parser.py +0 -0
  124. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/file_to_images.py +0 -0
  125. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/hashing.py +0 -0
  126. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/uv_sync.py +0 -0
  127. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/validation.py +0 -0
  128. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/__init__.py +0 -0
  129. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/conftest.py +0 -0
  130. {classifyre_cli-0.4.23/src/sources/sqlite → classifyre_cli-0.4.25/tests/detectors}/__init__.py +0 -0
  131. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  132. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/conftest.py +0 -0
  133. {classifyre_cli-0.4.23/src/sources/neo4j → classifyre_cli-0.4.25/tests/detectors/content}/__init__.py +0 -0
  134. {classifyre_cli-0.4.23/src/detectors/threat → classifyre_cli-0.4.25/tests/detectors/custom}/__init__.py +0 -0
  135. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/conftest.py +0 -0
  136. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  137. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_llm_runner.py +0 -0
  138. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  139. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_regex_runner.py +0 -0
  140. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_transformer_runners.py +0 -0
  141. {classifyre_cli-0.4.23/src/detectors/secrets → classifyre_cli-0.4.25/tests/detectors/pii}/__init__.py +0 -0
  142. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/pii/conftest.py +0 -0
  143. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/pii/sample_invoice.pdf +0 -0
  144. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/pii/test_pii_detector.py +0 -0
  145. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  146. {classifyre_cli-0.4.23/src/detectors/pii → classifyre_cli-0.4.25/tests/detectors/secrets}/__init__.py +0 -0
  147. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  148. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  149. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_base_detector.py +0 -0
  150. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  151. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  152. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_detector_pipeline_types.py +0 -0
  153. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_detector_schema_examples.py +0 -0
  154. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_detector_types.py +0 -0
  155. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_phase2_detectors.py +0 -0
  156. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_registry.py +0 -0
  157. {classifyre_cli-0.4.23/src/detectors/content → classifyre_cli-0.4.25/tests/detectors/threat}/__init__.py +0 -0
  158. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/threat/test_code_security_detector.py +0 -0
  159. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/threat/test_yara_detector.py +0 -0
  160. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  161. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/integration/test_wordpress_links_assets.py +0 -0
  162. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/pipeline/test_detector_pipeline.py +0 -0
  163. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/pipeline/test_worker_pool.py +0 -0
  164. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_assets_metadata_catalog.py +0 -0
  165. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_azure_blob_storage_source.py +0 -0
  166. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_base_source_attachment.py +0 -0
  167. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_base_source_sampling.py +0 -0
  168. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_config.py +0 -0
  169. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_confluence_source.py +0 -0
  170. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_custom_extractor.py +0 -0
  171. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_databricks_source.py +0 -0
  172. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_dependency_groups.py +0 -0
  173. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_email_source.py +0 -0
  174. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_google_cloud_storage_source.py +0 -0
  175. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_hashing.py +0 -0
  176. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_hive_source.py +0 -0
  177. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_jira_source.py +0 -0
  178. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_mongodb_source.py +0 -0
  179. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_mssql_source.py +0 -0
  180. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_mysql_source.py +0 -0
  181. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_neo4j_source.py +0 -0
  182. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_notion_source.py +0 -0
  183. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_oracle_source.py +0 -0
  184. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_outputs.py +0 -0
  185. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_postgresql_source.py +0 -0
  186. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_powerbi_source.py +0 -0
  187. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_recipe_normalizer.py +0 -0
  188. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_s3_compatible_storage_source.py +0 -0
  189. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_sandbox_runner.py +0 -0
  190. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_servicedesk_source.py +0 -0
  191. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_slack_source.py +0 -0
  192. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_snowflake_source.py +0 -0
  193. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_source_dependency_groups.py +0 -0
  194. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_sqlite_source.py +0 -0
  195. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_tableau_source.py +0 -0
  196. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_tabular_utils.py +0 -0
  197. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_uv_sync.py +0 -0
  198. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_wordpress_source.py +0 -0
  199. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_youtube_source.py +0 -0
  200. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_youtube_source_integration.py +0 -0
  201. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_content_extraction.py +0 -0
  202. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_embedded_images.py +0 -0
  203. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_file_metadata.py +0 -0
  204. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_file_parser.py +0 -0
  205. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_file_to_images.py +0 -0
  206. {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_transcription.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 262 packages in 370ms
2
+ Resolved 262 packages in 244ms
3
3
  Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.23
3
+ Version: 0.4.25
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.23",
3
+ "version": "0.4.25",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.23"
3
+ version = "0.4.25"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -2032,7 +2032,9 @@ class CoreInput(BaseModel):
2032
2032
 
2033
2033
 
2034
2034
  class SlackInput(CoreInput):
2035
- type: Literal['SLACK'] = Field('SLACK', description='Type of the asset or source')
2035
+ type: Literal['SLACK'] | None = Field(
2036
+ None, description='Type of the asset or source'
2037
+ )
2036
2038
  required: SlackRequired
2037
2039
  masked: SlackMaskedBotToken | SlackMaskedUserToken | SlackMaskedToken = Field(
2038
2040
  ..., title='SlackMasked'
@@ -2050,7 +2052,9 @@ class SlackInput(CoreInput):
2050
2052
 
2051
2053
 
2052
2054
  class EmailInput(CoreInput):
2053
- type: Literal['EMAIL'] = Field('EMAIL', description='Type of the asset or source')
2055
+ type: Literal['EMAIL'] | None = Field(
2056
+ None, description='Type of the asset or source'
2057
+ )
2054
2058
  required: EmailRequired
2055
2059
  masked: EmailMasked
2056
2060
  optional: EmailOptional | None = None
@@ -2066,8 +2070,8 @@ class EmailInput(CoreInput):
2066
2070
 
2067
2071
 
2068
2072
  class S3CompatibleStorageInput(CoreInput):
2069
- type: Literal['S3_COMPATIBLE_STORAGE'] = Field(
2070
- 'S3_COMPATIBLE_STORAGE', description='Type of the asset or source'
2073
+ type: Literal['S3_COMPATIBLE_STORAGE'] | None = Field(
2074
+ None, description='Type of the asset or source'
2071
2075
  )
2072
2076
  required: S3CompatibleStorageRequired
2073
2077
  masked: S3CompatibleStorageMasked | None = None
@@ -2084,8 +2088,8 @@ class S3CompatibleStorageInput(CoreInput):
2084
2088
 
2085
2089
 
2086
2090
  class AzureBlobStorageInput(CoreInput):
2087
- type: Literal['AZURE_BLOB_STORAGE'] = Field(
2088
- 'AZURE_BLOB_STORAGE', description='Type of the asset or source'
2091
+ type: Literal['AZURE_BLOB_STORAGE'] | None = Field(
2092
+ None, description='Type of the asset or source'
2089
2093
  )
2090
2094
  required: AzureBlobStorageRequired
2091
2095
  masked: AzureBlobStorageMasked | None = None
@@ -2102,8 +2106,8 @@ class AzureBlobStorageInput(CoreInput):
2102
2106
 
2103
2107
 
2104
2108
  class GoogleCloudStorageInput(CoreInput):
2105
- type: Literal['GOOGLE_CLOUD_STORAGE'] = Field(
2106
- 'GOOGLE_CLOUD_STORAGE', description='Type of the asset or source'
2109
+ type: Literal['GOOGLE_CLOUD_STORAGE'] | None = Field(
2110
+ None, description='Type of the asset or source'
2107
2111
  )
2108
2112
  required: GoogleCloudStorageRequired
2109
2113
  masked: GoogleCloudStorageMasked | None = None
@@ -2120,8 +2124,8 @@ class GoogleCloudStorageInput(CoreInput):
2120
2124
 
2121
2125
 
2122
2126
  class WordPressInput(CoreInput):
2123
- type: Literal['WORDPRESS'] = Field(
2124
- 'WORDPRESS', description='Type of the asset or source'
2127
+ type: Literal['WORDPRESS'] | None = Field(
2128
+ None, description='Type of the asset or source'
2125
2129
  )
2126
2130
  required: WordPressRequired
2127
2131
  masked: WordPressMasked
@@ -2138,8 +2142,8 @@ class WordPressInput(CoreInput):
2138
2142
 
2139
2143
 
2140
2144
  class PostgreSQLInput(CoreInput):
2141
- type: Literal['POSTGRESQL'] = Field(
2142
- 'POSTGRESQL', description='Type of the asset or source'
2145
+ type: Literal['POSTGRESQL'] | None = Field(
2146
+ None, description='Type of the asset or source'
2143
2147
  )
2144
2148
  required: PostgreSQLRequired
2145
2149
  masked: PostgreSQLMasked
@@ -2156,7 +2160,9 @@ class PostgreSQLInput(CoreInput):
2156
2160
 
2157
2161
 
2158
2162
  class MySQLInput(CoreInput):
2159
- type: Literal['MYSQL'] = Field('MYSQL', description='Type of the asset or source')
2163
+ type: Literal['MYSQL'] | None = Field(
2164
+ None, description='Type of the asset or source'
2165
+ )
2160
2166
  required: MySQLRequired
2161
2167
  masked: MySQLMasked
2162
2168
  optional: MySQLOptional | None = None
@@ -2172,7 +2178,9 @@ class MySQLInput(CoreInput):
2172
2178
 
2173
2179
 
2174
2180
  class MSSQLInput(CoreInput):
2175
- type: Literal['MSSQL'] = Field('MSSQL', description='Type of the asset or source')
2181
+ type: Literal['MSSQL'] | None = Field(
2182
+ None, description='Type of the asset or source'
2183
+ )
2176
2184
  required: MSSQLRequired
2177
2185
  masked: MSSQLMasked
2178
2186
  optional: MSSQLOptional | None = None
@@ -2188,7 +2196,9 @@ class MSSQLInput(CoreInput):
2188
2196
 
2189
2197
 
2190
2198
  class OracleInput(CoreInput):
2191
- type: Literal['ORACLE'] = Field('ORACLE', description='Type of the asset or source')
2199
+ type: Literal['ORACLE'] | None = Field(
2200
+ None, description='Type of the asset or source'
2201
+ )
2192
2202
  required: OracleRequired
2193
2203
  masked: OracleMasked
2194
2204
  optional: OracleOptional | None = None
@@ -2204,7 +2214,9 @@ class OracleInput(CoreInput):
2204
2214
 
2205
2215
 
2206
2216
  class HiveInput(CoreInput):
2207
- type: Literal['HIVE'] = Field('HIVE', description='Type of the asset or source')
2217
+ type: Literal['HIVE'] | None = Field(
2218
+ None, description='Type of the asset or source'
2219
+ )
2208
2220
  required: HiveRequired
2209
2221
  masked: HiveMasked
2210
2222
  optional: HiveOptional | None = None
@@ -2220,8 +2232,8 @@ class HiveInput(CoreInput):
2220
2232
 
2221
2233
 
2222
2234
  class DatabricksInput(CoreInput):
2223
- type: Literal['DATABRICKS'] = Field(
2224
- 'DATABRICKS', description='Type of the asset or source'
2235
+ type: Literal['DATABRICKS'] | None = Field(
2236
+ None, description='Type of the asset or source'
2225
2237
  )
2226
2238
  required: PersonalAccessToken | ServicePrincipalOAuthM2M | AzureServicePrincipal = (
2227
2239
  Field(..., title='DatabricksRequired')
@@ -2242,8 +2254,8 @@ class DatabricksInput(CoreInput):
2242
2254
 
2243
2255
 
2244
2256
  class SnowflakeInput(CoreInput):
2245
- type: Literal['SNOWFLAKE'] = Field(
2246
- 'SNOWFLAKE', description='Type of the asset or source'
2257
+ type: Literal['SNOWFLAKE'] | None = Field(
2258
+ None, description='Type of the asset or source'
2247
2259
  )
2248
2260
  required: (
2249
2261
  SnowflakeRequiredDefaultAuthenticator
@@ -2270,8 +2282,8 @@ class SnowflakeInput(CoreInput):
2270
2282
 
2271
2283
 
2272
2284
  class MongoDBInput(CoreInput):
2273
- type: Literal['MONGODB'] = Field(
2274
- 'MONGODB', description='Type of the asset or source'
2285
+ type: Literal['MONGODB'] | None = Field(
2286
+ None, description='Type of the asset or source'
2275
2287
  )
2276
2288
  required: MongoDBRequiredAtlas | MongoDBRequiredOnPrem = Field(
2277
2289
  ..., title='MongoDBRequired'
@@ -2398,7 +2410,9 @@ class Neo4jOptional(BaseModel):
2398
2410
 
2399
2411
 
2400
2412
  class Neo4jInput(CoreInput):
2401
- type: Literal['NEO4J'] = Field('NEO4J', description='Type of the asset or source')
2413
+ type: Literal['NEO4J'] | None = Field(
2414
+ None, description='Type of the asset or source'
2415
+ )
2402
2416
  required: Neo4jRequired
2403
2417
  masked: Neo4jMaskedUsernamePassword | Neo4jMaskedNone = Field(
2404
2418
  ..., title='Neo4jMasked'
@@ -2416,8 +2430,8 @@ class Neo4jInput(CoreInput):
2416
2430
 
2417
2431
 
2418
2432
  class PowerBIInput(CoreInput):
2419
- type: Literal['POWERBI'] = Field(
2420
- 'POWERBI', description='Type of the asset or source'
2433
+ type: Literal['POWERBI'] | None = Field(
2434
+ None, description='Type of the asset or source'
2421
2435
  )
2422
2436
  required: PowerBIRequiredServicePrincipal | PowerBIRequiredAccessToken = Field(
2423
2437
  ..., title='PowerBIRequired'
@@ -2438,8 +2452,8 @@ class PowerBIInput(CoreInput):
2438
2452
 
2439
2453
 
2440
2454
  class TableauInput(CoreInput):
2441
- type: Literal['TABLEAU'] = Field(
2442
- 'TABLEAU', description='Type of the asset or source'
2455
+ type: Literal['TABLEAU'] | None = Field(
2456
+ None, description='Type of the asset or source'
2443
2457
  )
2444
2458
  required: TableauRequiredUsernamePassword | TableauRequiredPersonalAccessToken = (
2445
2459
  Field(..., title='TableauRequired')
@@ -2838,8 +2852,8 @@ class Type19(StrEnum):
2838
2852
 
2839
2853
 
2840
2854
  class ConfluenceInput(CoreInput):
2841
- type: Literal['CONFLUENCE'] = Field(
2842
- 'CONFLUENCE', description='Type of the asset or source'
2855
+ type: Literal['CONFLUENCE'] | None = Field(
2856
+ None, description='Type of the asset or source'
2843
2857
  )
2844
2858
  required: ConfluenceRequired
2845
2859
  masked: ConfluenceMasked
@@ -2856,7 +2870,9 @@ class ConfluenceInput(CoreInput):
2856
2870
 
2857
2871
 
2858
2872
  class JiraInput(CoreInput):
2859
- type: Literal['JIRA'] = Field('JIRA', description='Type of the asset or source')
2873
+ type: Literal['JIRA'] | None = Field(
2874
+ None, description='Type of the asset or source'
2875
+ )
2860
2876
  required: JiraRequired
2861
2877
  masked: JiraMasked
2862
2878
  optional: JiraOptional | None = None
@@ -2872,8 +2888,8 @@ class JiraInput(CoreInput):
2872
2888
 
2873
2889
 
2874
2890
  class ServiceDeskInput(CoreInput):
2875
- type: Literal['SERVICEDESK'] = Field(
2876
- 'SERVICEDESK', description='Type of the asset or source'
2891
+ type: Literal['SERVICEDESK'] | None = Field(
2892
+ None, description='Type of the asset or source'
2877
2893
  )
2878
2894
  required: ServiceDeskRequired
2879
2895
  masked: ServiceDeskMasked
@@ -2924,7 +2940,9 @@ class SQLiteOptional(BaseModel):
2924
2940
 
2925
2941
 
2926
2942
  class SQLiteInput(CoreInput):
2927
- type: Literal['SQLITE'] = Field('SQLITE', description='Type of the asset or source')
2943
+ type: Literal['SQLITE'] | None = Field(
2944
+ None, description='Type of the asset or source'
2945
+ )
2928
2946
  required: SQLiteRequired
2929
2947
  masked: dict[str, Any] | None = Field(
2930
2948
  None,
@@ -3057,7 +3075,9 @@ class NotionOptional(BaseModel):
3057
3075
 
3058
3076
 
3059
3077
  class NotionInput(CoreInput):
3060
- type: Literal['NOTION'] = Field('NOTION', description='Type of the asset or source')
3078
+ type: Literal['NOTION'] | None = Field(
3079
+ None, description='Type of the asset or source'
3080
+ )
3061
3081
  required: NotionRequired
3062
3082
  masked: NotionMasked
3063
3083
  optional: NotionOptional | None = None
@@ -3073,8 +3093,8 @@ class NotionInput(CoreInput):
3073
3093
 
3074
3094
 
3075
3095
  class YouTubeInput(CoreInput):
3076
- type: Literal['YOUTUBE'] = Field(
3077
- 'YOUTUBE', description='Type of the asset or source'
3096
+ type: Literal['YOUTUBE'] | None = Field(
3097
+ None, description='Type of the asset or source'
3078
3098
  )
3079
3099
  required: YouTubeRequired
3080
3100
  masked: YouTubeMasked | None = None
@@ -297,13 +297,23 @@ class DetectorPipeline:
297
297
  page_num=page_num,
298
298
  )
299
299
  elapsed = int((time.monotonic() - t0) * 1000)
300
+ snippet = page_content[:120].replace("\n", "\\n") if page_content else ""
300
301
  logger.info(
301
- " %s page %d done: %d findings (%dms)",
302
+ " %s page %d: %d findings in %dms — snippet: %s",
302
303
  asset.name,
303
304
  page_num,
304
305
  len(page_findings),
305
306
  elapsed,
307
+ snippet,
306
308
  )
309
+ if page_findings:
310
+ for f in page_findings[:5]:
311
+ logger.info(
312
+ " finding: type=%s detector=%s matched=%.100s",
313
+ f.finding_type,
314
+ f.detector_type,
315
+ f.matched_content[:100].replace("\n", " "),
316
+ )
307
317
  return page_findings, page_types, page_errors, page_content, page_num
308
318
 
309
319
  def _collect_done() -> None:
@@ -404,13 +414,23 @@ class DetectorPipeline:
404
414
  page_num=page_num,
405
415
  )
406
416
  elapsed = int((time.monotonic() - t0) * 1000)
417
+ snippet = page_content[:120].replace("\n", "\\n") if page_content else ""
407
418
  logger.info(
408
- " %s page %d done: %d findings (%dms)",
419
+ " %s page %d: %d findings in %dms — snippet: %s",
409
420
  asset.name,
410
421
  page_num,
411
422
  len(page_findings),
412
423
  elapsed,
424
+ snippet,
413
425
  )
426
+ if page_findings:
427
+ for f in page_findings[:5]:
428
+ logger.info(
429
+ " finding: type=%s detector=%s matched=%.100s",
430
+ f.finding_type,
431
+ f.detector_type,
432
+ f.matched_content[:100].replace("\n", " "),
433
+ )
414
434
  return page_findings, page_types, page_errors, page_content, page_num
415
435
 
416
436
  async def _collect_done_and_flush(min_findings: int = 1) -> None:
@@ -491,6 +511,12 @@ class DetectorPipeline:
491
511
  continue
492
512
  candidate_ids.append(value)
493
513
 
514
+ logger.info(
515
+ "_iter_text_content_pages(%s): trying candidates %s",
516
+ asset.name,
517
+ candidate_ids,
518
+ )
519
+
494
520
  for candidate_id in candidate_ids:
495
521
  saw_candidate_content = False
496
522
  async for text_content in self.content_provider.fetch_text_pages(candidate_id):
@@ -502,6 +528,16 @@ class DetectorPipeline:
502
528
  if saw_candidate_content:
503
529
  return
504
530
 
531
+ # If fetch_content_pages ran the full bytes-path extraction (even
532
+ # yielding 0 text, e.g. silent audio), the source already did the
533
+ # expensive work. Don't re-process with another candidate ID for
534
+ # the same asset.
535
+ source = getattr(self.content_provider, "_source", None)
536
+ if source is not None:
537
+ processed: set[str] = getattr(source, "_content_pages_processed", set())
538
+ if candidate_id in processed:
539
+ return
540
+
505
541
  async def _run_binary_detectors_for_asset(
506
542
  self,
507
543
  *,
@@ -3,11 +3,14 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ import logging
6
7
  from collections.abc import AsyncGenerator
7
8
 
8
9
  from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
9
10
  from ..sources.base import BaseSource
10
11
 
12
+ logger = logging.getLogger(__name__)
13
+
11
14
 
12
15
  class ParsedContentProvider:
13
16
  """
@@ -32,11 +35,30 @@ class ParsedContentProvider:
32
35
  if saw_text:
33
36
  return
34
37
 
38
+ # If fetch_content_pages already ran the full extraction pipeline for
39
+ # this asset (tracked via _content_pages_processed), skip the fallback
40
+ # iter_asset_pages call. Without this, an all-silence audio file would
41
+ # trigger a redundant second transcription pass.
42
+ pages_processed: set[str] | None = getattr(self._source, "_content_pages_processed", None)
43
+ if isinstance(pages_processed, set) and asset_id in pages_processed:
44
+ logger.info(
45
+ "fetch_text_pages(%s): source already processed, skipping fallback",
46
+ asset_id,
47
+ )
48
+ return
49
+
35
50
  result = await self._source.fetch_content_bytes(asset_id)
36
51
  if result is None:
52
+ logger.info("fetch_text_pages(%s): fetch_content_bytes returned None", asset_id)
37
53
  return
38
54
 
39
55
  raw_bytes, mime = result
56
+ logger.info(
57
+ "fetch_text_pages(%s): fallback iter_asset_pages path (%s, %d bytes)",
58
+ asset_id,
59
+ mime,
60
+ len(raw_bytes),
61
+ )
40
62
  pages: list[str] = await asyncio.to_thread(
41
63
  list,
42
64
  self._source.iter_asset_pages(raw_bytes, mime),
@@ -21,6 +21,9 @@ from concurrent.futures import ProcessPoolExecutor
21
21
  from typing import Any
22
22
 
23
23
  from ..models.generated_single_asset_scan_results import DetectionResult
24
+ from ..utils.resources import get_effective_cpu_count, get_effective_memory_mb
25
+
26
+ __all__ = ["get_effective_cpu_count", "get_effective_memory_mb"]
24
27
 
25
28
  logger = logging.getLogger(__name__)
26
29
 
@@ -130,63 +133,6 @@ def is_io_bound_detector(detector_name: str) -> bool:
130
133
  return detector_name in _IO_BOUND_DETECTORS
131
134
 
132
135
 
133
- def get_effective_cpu_count() -> int:
134
- """Return the number of usable CPUs, respecting cgroup limits (K8s/Docker).
135
-
136
- ``os.cpu_count()`` returns the *host* CPU count, which can be much larger
137
- than what the container is allowed to use. This function reads the cgroup
138
- v2 ``cpu.max`` (or v1 ``cpu.cfs_quota_us``/``cpu.cfs_period_us``) to
139
- determine the actual allocation.
140
- """
141
- try:
142
- data = open("/sys/fs/cgroup/cpu.max").read().strip()
143
- quota_str, period_str = data.split()
144
- if quota_str != "max":
145
- cpus = int(quota_str) / int(period_str)
146
- if cpus >= 0.5:
147
- return max(1, int(cpus))
148
- except (FileNotFoundError, OSError, ValueError):
149
- pass
150
-
151
- try:
152
- quota = int(open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us").read().strip())
153
- period = int(open("/sys/fs/cgroup/cpu/cpu.cfs_period_us").read().strip())
154
- if quota > 0 and period > 0:
155
- cpus = quota / period
156
- if cpus >= 0.5:
157
- return max(1, int(cpus))
158
- except (FileNotFoundError, OSError, ValueError):
159
- pass
160
-
161
- return os.cpu_count() or 4
162
-
163
-
164
- def get_effective_memory_mb() -> int:
165
- """Return usable memory in MB, respecting cgroup limits."""
166
- try:
167
- mem_bytes = int(open("/sys/fs/cgroup/memory.max").read().strip())
168
- if mem_bytes < 2**50:
169
- return max(256, mem_bytes // (1024 * 1024))
170
- except (FileNotFoundError, OSError, ValueError):
171
- pass
172
-
173
- try:
174
- mem_bytes = int(open("/sys/fs/cgroup/memory/memory.limit_in_bytes").read().strip())
175
- if mem_bytes < 2**50:
176
- return max(256, mem_bytes // (1024 * 1024))
177
- except (FileNotFoundError, OSError, ValueError):
178
- pass
179
-
180
- try:
181
- for line in open("/proc/meminfo"):
182
- if line.startswith("MemTotal:"):
183
- return max(256, int(line.split()[1]) // 1024)
184
- except (FileNotFoundError, OSError, ValueError):
185
- pass
186
-
187
- return 4096
188
-
189
-
190
136
  def compute_pool_workers(override: int | None = None) -> int:
191
137
  """Compute optimal pool size from actual resource limits.
192
138
 
@@ -135,6 +135,11 @@ class ObjectStorageSourceBase(BaseSource, ABC):
135
135
  # Keyed by both asset_hash and external_url for O(1) lookup from either.
136
136
  self._bytes_cache: dict[str, bytes] = {}
137
137
  self._mime_cache: dict[str, str] = {}
138
+ # asset_ids for which fetch_content_pages ran the full bytes path
139
+ # (even if it produced no text, e.g. all-silence audio). Checked by
140
+ # ParsedContentProvider to skip its fallback iter_asset_pages path,
141
+ # which would otherwise re-run an expensive transcription a second time.
142
+ self._content_pages_processed: set[str] = set()
138
143
  # Child IMAGE assets queued while transforming the current object.
139
144
  self._pending_child_assets: list[SingleAssetScanResults] = []
140
145
 
@@ -302,6 +307,15 @@ class ObjectStorageSourceBase(BaseSource, ABC):
302
307
 
303
308
  return OutputAssetType.OTHER
304
309
 
310
@staticmethod
def _asset_kind_for_asset_type(asset_type: OutputAssetType) -> str:
    """Translate an output asset type into its metadata "kind" label.

    Image, audio and video asset types map to ``"image"``, ``"audio"`` and
    ``"video"`` respectively; any other asset type falls back to the
    generic ``"file"`` kind.
    """
    media_kinds: dict[OutputAssetType, str] = {
        OutputAssetType.IMAGE: "image",
        OutputAssetType.AUDIO: "audio",
        OutputAssetType.VIDEO: "video",
    }
    try:
        return media_kinds[asset_type]
    except KeyError:
        # Not one of the dedicated media kinds.
        return "file"
318
+
305
319
  def _ensure_file_processing_dependencies(self) -> None:
306
320
  if self._file_processing_deps_checked:
307
321
  return
@@ -446,7 +460,7 @@ class ObjectStorageSourceBase(BaseSource, ABC):
446
460
  created_at=ref.last_modified,
447
461
  updated_at=ref.last_modified,
448
462
  runner_id=self.runner_id,
449
- **self.metadata_fields("file", asset_metadata),
463
+ **self.metadata_fields(self._asset_kind_for_asset_type(asset_type), asset_metadata),
450
464
  )
451
465
  self._hash_to_uri[asset_hash] = external_url
452
466
  self._object_ref_by_hash[asset_hash] = ref
@@ -549,6 +563,7 @@ class ObjectStorageSourceBase(BaseSource, ABC):
549
563
  self._object_ref_by_hash = {}
550
564
  self._bytes_cache = {}
551
565
  self._mime_cache = {}
566
+ self._content_pages_processed = set()
552
567
  self._pending_child_assets = []
553
568
 
554
569
  refs = self._list_objects()
@@ -628,26 +643,69 @@ class ObjectStorageSourceBase(BaseSource, ABC):
628
643
  raw_bytes = self._bytes_cache.get(asset_id)
629
644
  mime = self._mime_cache.get(asset_id, "")
630
645
 
646
+ logger.info(
647
+ "fetch_content_pages(%s): raw_bytes=%s mime=%s processed=%s",
648
+ asset_id,
649
+ f"{len(raw_bytes)} bytes" if raw_bytes is not None else "MISS",
650
+ mime or "MISS",
651
+ asset_id in self._content_pages_processed,
652
+ )
653
+
631
654
  if raw_bytes is not None:
632
655
  sampling = self.config.sampling
633
656
  batch_size = int(sampling.rows_per_page or 100)
634
657
  include_col_names = bool(
635
658
  sampling.include_column_names if sampling.include_column_names is not None else True
636
659
  )
637
- # Run the (potentially blocking) file parsing in a thread so pyarrow /
638
- # pdfplumber can't freeze the event loop during large file iteration.
639
- pages: list[str] = await asyncio.to_thread(
640
- list,
641
- self.iter_asset_pages(
642
- raw_bytes,
643
- mime,
644
- batch_size,
645
- include_col_names,
646
- file_name=self._file_name_for_asset_id(asset_id),
647
- ),
660
+ file_name = self._file_name_for_asset_id(asset_id)
661
+
662
+ # Stream pages from a thread instead of materializing via list().
663
+ # For transcription this lets detectors start working on the first
664
+ # chunk while later chunks are still being transcribed.
665
+ loop = asyncio.get_running_loop()
666
+ queue: asyncio.Queue[str | None] = asyncio.Queue()
667
+
668
+ exc_info: list[BaseException | None] = [None]
669
+
670
+ page_count: int = 0
671
+
672
+ def _produce() -> None:
673
+ nonlocal page_count
674
+ try:
675
+ for page in self.iter_asset_pages(
676
+ raw_bytes,
677
+ mime,
678
+ batch_size,
679
+ include_col_names,
680
+ file_name=file_name,
681
+ ):
682
+ loop.call_soon_threadsafe(queue.put_nowait, page)
683
+ page_count += 1
684
+ except BaseException as exc:
685
+ exc_info[0] = exc
686
+ finally:
687
+ loop.call_soon_threadsafe(queue.put_nowait, None)
688
+
689
+ task = loop.run_in_executor(None, _produce)
690
+
691
+ while True:
692
+ page = await queue.get()
693
+ if page is None:
694
+ break
695
+ yield "", page
696
+
697
+ await task
698
+ if exc_info[0] is not None:
699
+ raise exc_info[0] # type: ignore[misc]
700
+
701
+ logger.info(
702
+ "fetch_content_pages(%s): streamed %d page(s) from %s",
703
+ asset_id,
704
+ page_count,
705
+ file_name,
648
706
  )
649
- for batch_text in pages:
650
- yield "", batch_text
707
+
708
+ self._content_pages_processed.add(asset_id)
651
709
  return
652
710
 
653
711
  result = await self.fetch_content(asset_id)
@@ -0,0 +1,65 @@
1
+ """Cgroup-aware CPU and memory introspection.
2
+
3
+ Shared by the detector worker pool (to size the process pool) and the
4
+ transcription pipeline (to select the right Whisper model at runtime).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+
11
+
12
def get_effective_cpu_count() -> int:
    """Return usable CPUs, respecting cgroup limits (K8s / Docker).

    ``os.cpu_count()`` returns the *host* count, which is usually much larger
    than the container's CPU quota. This reads cgroup v2 (``cpu.max``) and
    then cgroup v1 (``cpu.cfs_quota_us`` / ``cpu.cfs_period_us``) to get the
    actual allocation.

    Returns:
        The whole number of CPUs granted by the cgroup quota (at least 1)
        when a quota of at least half a CPU is configured; otherwise
        ``os.cpu_count()``, or 4 when that is unavailable.
    """
    # cgroup v2: one file holding "<quota> <period>"; quota "max" = no limit.
    try:
        with open("/sys/fs/cgroup/cpu.max") as fh:
            quota_str, period_str = fh.read().split()
        if quota_str != "max":
            cpus = int(quota_str) / int(period_str)
            if cpus >= 0.5:
                return max(1, int(cpus))
    except (OSError, ValueError, ZeroDivisionError):
        # Missing/unreadable file, unexpected format, or a zero period:
        # fall through to the next probe instead of failing.
        pass

    # cgroup v1: quota and period live in separate files; a non-positive
    # quota means "unlimited".
    try:
        with open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") as fh:
            quota = int(fh.read().strip())
        with open("/sys/fs/cgroup/cpu/cpu.cfs_period_us") as fh:
            period = int(fh.read().strip())
        if quota > 0 and period > 0:
            cpus = quota / period
            if cpus >= 0.5:
                return max(1, int(cpus))
    except (OSError, ValueError):
        pass

    # No usable cgroup quota: fall back to the interpreter's view.
    return os.cpu_count() or 4
40
+
41
+
42
def get_effective_memory_mb() -> int:
    """Return usable memory in MB, respecting cgroup limits (K8s / Docker).

    Probes, in order, the cgroup v2 limit (``memory.max``), the cgroup v1
    limit (``memory.limit_in_bytes``) and finally ``MemTotal`` from
    ``/proc/meminfo``.

    Returns:
        The first limit found, converted to MB and never less than 256;
        4096 when no probe yields a value.
    """
    cgroup_limit_files = (
        "/sys/fs/cgroup/memory.max",  # cgroup v2
        "/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroup v1
    )
    for limit_file in cgroup_limit_files:
        try:
            with open(limit_file) as fh:
                # A v2 value of "max" is not an integer and lands in the
                # ValueError handler below.
                mem_bytes = int(fh.read().strip())
        except (OSError, ValueError):
            continue
        # Values of 2**50 bytes or more are treated as "no limit".
        if mem_bytes < 2**50:
            return max(256, mem_bytes // (1024 * 1024))

    try:
        with open("/proc/meminfo") as fh:
            for line in fh:
                if line.startswith("MemTotal:"):
                    # The second field is divided by 1024 to yield MB.
                    return max(256, int(line.split()[1]) // 1024)
    except (OSError, ValueError, IndexError):
        # Unreadable file or malformed MemTotal line: use the default.
        pass

    return 4096