classifyre-cli 0.4.10__tar.gz → 0.4.12__tar.gz

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (185)
  1. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/package.json +1 -1
  4. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/pyproject.toml +8 -2
  5. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_base.py +28 -0
  6. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_image_classification.py +41 -31
  7. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_llm.py +71 -6
  8. classifyre_cli-0.4.12/src/detectors/custom/runners/_object_detection.py +121 -0
  9. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/models/generated_detectors.py +4 -0
  10. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/models/generated_input.py +32 -4
  11. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/rest.py +4 -0
  12. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/detector_pipeline.py +13 -32
  13. classifyre_cli-0.4.12/src/sandbox/runner.py +308 -0
  14. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/databricks/source.py +61 -8
  15. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/object_storage/base.py +81 -5
  16. classifyre_cli-0.4.12/src/utils/embedded_images.py +222 -0
  17. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/file_parser.py +65 -38
  18. classifyre_cli-0.4.12/src/utils/file_to_images.py +134 -0
  19. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_llm_runner.py +82 -3
  20. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_transformer_runners.py +3 -3
  21. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_outputs.py +1 -0
  22. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_s3_compatible_storage_source.py +68 -0
  23. classifyre_cli-0.4.12/tests/test_sandbox_runner.py +214 -0
  24. classifyre_cli-0.4.12/tests/utils/test_embedded_images.py +129 -0
  25. classifyre_cli-0.4.12/tests/utils/test_file_to_images.py +99 -0
  26. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/uv.lock +363 -271
  27. classifyre_cli-0.4.10/src/detectors/custom/runners/_object_detection.py +0 -107
  28. classifyre_cli-0.4.10/src/sandbox/runner.py +0 -145
  29. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/.gitignore +0 -0
  30. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/.python-version +0 -0
  31. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/README.md +0 -0
  32. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/main.py +0 -0
  33. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/scripts/generate_models.py +0 -0
  34. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/__init__.py +0 -0
  35. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/__init__.py +0 -0
  36. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/base.py +0 -0
  37. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/broken_links/__init__.py +0 -0
  38. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/broken_links/detector.py +0 -0
  39. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/config.py +0 -0
  40. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/content/__init__.py +0 -0
  41. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/__init__.py +0 -0
  42. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/detector.py +0 -0
  43. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/extractor.py +0 -0
  44. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/__init__.py +0 -0
  45. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_factory.py +0 -0
  46. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  47. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_gliner2.py +0 -0
  48. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_regex.py +0 -0
  49. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_text_classification.py +0 -0
  50. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/trainer.py +0 -0
  51. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/dependencies.py +0 -0
  52. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/pii/__init__.py +0 -0
  53. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/pii/detector.py +0 -0
  54. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/secrets/__init__.py +0 -0
  55. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/secrets/detector.py +0 -0
  56. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/threat/__init__.py +0 -0
  57. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/threat/code_security_detector.py +0 -0
  58. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/threat/yara_detector.py +0 -0
  59. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/main.py +0 -0
  60. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/models/generated_single_asset_scan_results.py +0 -0
  61. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/__init__.py +0 -0
  62. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/base.py +0 -0
  63. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/console.py +0 -0
  64. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/factory.py +0 -0
  65. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/file.py +0 -0
  66. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/__init__.py +0 -0
  67. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/content_provider.py +0 -0
  68. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/parsed_content_provider.py +0 -0
  69. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/worker_pool.py +0 -0
  70. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sandbox/__init__.py +0 -0
  71. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/__init__.py +0 -0
  72. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/atlassian_common.py +0 -0
  73. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/azure_blob_storage/__init__.py +0 -0
  74. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/azure_blob_storage/source.py +0 -0
  75. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/base.py +0 -0
  76. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/confluence/__init__.py +0 -0
  77. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/confluence/source.py +0 -0
  78. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/databricks/__init__.py +0 -0
  79. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/dependencies.py +0 -0
  80. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/google_cloud_storage/__init__.py +0 -0
  81. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/google_cloud_storage/source.py +0 -0
  82. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/hive/__init__.py +0 -0
  83. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/hive/source.py +0 -0
  84. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/jira/__init__.py +0 -0
  85. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/jira/source.py +0 -0
  86. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mongodb/__init__.py +0 -0
  87. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mongodb/source.py +0 -0
  88. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mssql/__init__.py +0 -0
  89. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mssql/source.py +0 -0
  90. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mysql/__init__.py +0 -0
  91. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mysql/source.py +0 -0
  92. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/neo4j/__init__.py +0 -0
  93. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/neo4j/source.py +0 -0
  94. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/oracle/__init__.py +0 -0
  95. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/oracle/source.py +0 -0
  96. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/postgresql/__init__.py +0 -0
  97. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/postgresql/source.py +0 -0
  98. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/powerbi/__init__.py +0 -0
  99. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/powerbi/source.py +0 -0
  100. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/recipe_normalizer.py +0 -0
  101. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/s3_compatible_storage/README.md +0 -0
  102. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/s3_compatible_storage/__init__.py +0 -0
  103. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/s3_compatible_storage/source.py +0 -0
  104. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/servicedesk/__init__.py +0 -0
  105. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/servicedesk/source.py +0 -0
  106. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/slack/__init__.py +0 -0
  107. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/slack/source.py +0 -0
  108. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/snowflake/__init__.py +0 -0
  109. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/snowflake/source.py +0 -0
  110. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/sqlite/__init__.py +0 -0
  111. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/sqlite/source.py +0 -0
  112. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/tableau/__init__.py +0 -0
  113. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/tableau/source.py +0 -0
  114. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/tabular_base.py +0 -0
  115. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/tabular_utils.py +0 -0
  116. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/wordpress/__init__.py +0 -0
  117. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/wordpress/source.py +0 -0
  118. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/telemetry.py +0 -0
  119. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/__init__.py +0 -0
  120. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/content_extraction.py +0 -0
  121. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/hashing.py +0 -0
  122. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/uv_sync.py +0 -0
  123. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/validation.py +0 -0
  124. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/__init__.py +0 -0
  125. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/conftest.py +0 -0
  126. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/__init__.py +0 -0
  127. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  128. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/conftest.py +0 -0
  129. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/content/__init__.py +0 -0
  130. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/__init__.py +0 -0
  131. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/conftest.py +0 -0
  132. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  133. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  134. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_regex_runner.py +0 -0
  135. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/__init__.py +0 -0
  136. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/conftest.py +0 -0
  137. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/sample_invoice.pdf +0 -0
  138. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/test_pii_detector.py +0 -0
  139. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  140. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/secrets/__init__.py +0 -0
  141. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  142. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  143. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_base_detector.py +0 -0
  144. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  145. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  146. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_detector_pipeline_types.py +0 -0
  147. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_detector_schema_examples.py +0 -0
  148. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_detector_types.py +0 -0
  149. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_phase2_detectors.py +0 -0
  150. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_registry.py +0 -0
  151. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/threat/__init__.py +0 -0
  152. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/threat/test_code_security_detector.py +0 -0
  153. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/threat/test_yara_detector.py +0 -0
  154. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  155. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/integration/test_wordpress_links_assets.py +0 -0
  156. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/pipeline/test_detector_pipeline.py +0 -0
  157. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/pipeline/test_worker_pool.py +0 -0
  158. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_azure_blob_storage_source.py +0 -0
  159. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_base_source_attachment.py +0 -0
  160. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_base_source_sampling.py +0 -0
  161. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_confluence_source.py +0 -0
  162. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_custom_extractor.py +0 -0
  163. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_databricks_source.py +0 -0
  164. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_google_cloud_storage_source.py +0 -0
  165. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_hashing.py +0 -0
  166. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_hive_source.py +0 -0
  167. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_jira_source.py +0 -0
  168. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_mongodb_source.py +0 -0
  169. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_mssql_source.py +0 -0
  170. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_mysql_source.py +0 -0
  171. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_neo4j_source.py +0 -0
  172. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_oracle_source.py +0 -0
  173. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_postgresql_source.py +0 -0
  174. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_powerbi_source.py +0 -0
  175. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_recipe_normalizer.py +0 -0
  176. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_servicedesk_source.py +0 -0
  177. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_slack_source.py +0 -0
  178. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_snowflake_source.py +0 -0
  179. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_source_dependency_groups.py +0 -0
  180. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_sqlite_source.py +0 -0
  181. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_tableau_source.py +0 -0
  182. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_tabular_utils.py +0 -0
  183. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_wordpress_source.py +0 -0
  184. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/utils/test_content_extraction.py +0 -0
  185. {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/utils/test_file_parser.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 265 packages in 156ms
2
+ Resolved 267 packages in 181ms
3
3
  Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.10
3
+ Version: 0.4.12
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.10",
3
+ "version": "0.4.12",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.10"
3
+ version = "0.4.12"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -47,7 +47,7 @@ privacy = [
47
47
  # mid-run in frozen/venv contexts. 8.x eagerly loads all data at import time,
48
48
  # avoiding ModuleNotFoundError during Presidio phone number analysis.
49
49
  "phonenumbers>=8.13.0,<10.0.0",
50
- "numpy>=1.26.0,<2.0.0",
50
+ "numpy>=1.26.0,<3.0.0",
51
51
  ]
52
52
  security = [
53
53
  "detect-secrets>=1.5.0",
@@ -93,6 +93,10 @@ regex = [
93
93
  ]
94
94
  llm = [
95
95
  "litellm>=1.86.2",
96
+ # Pure-wheel PDF renderer (permissive license, no system binaries) used to
97
+ # rasterise PDF pages to images for vision-capable LLM detectors.
98
+ "pypdfium2>=4.30.0",
99
+ "pillow>=12.2.0",
96
100
  ]
97
101
  detectors = [
98
102
  { include-group = "file-processing" },
@@ -270,6 +274,8 @@ module = [
270
274
  "setfit",
271
275
  "litellm.*",
272
276
  "litellm",
277
+ "pypdfium2.*",
278
+ "pypdfium2",
273
279
  "sklearn.*",
274
280
  "sklearn",
275
281
  "numpy",
@@ -2,6 +2,8 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import io
6
+ import logging
5
7
  import re
6
8
  from abc import ABC, abstractmethod
7
9
  from datetime import UTC, datetime
@@ -38,6 +40,32 @@ _IMAGE_CONTENT_TYPES = [
38
40
  "image/bmp",
39
41
  "image/tiff",
40
42
  ]
43
+ # Content types HuggingFace image detectors accept. Non-image renderable files
44
+ # (PDFs) are rasterised page-by-page via render_to_images before classification,
45
+ # mirroring the vision LLM detector's input handling.
46
+ _IMAGE_INPUT_CONTENT_TYPES = [*_IMAGE_CONTENT_TYPES, "application/pdf"]
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+
51
+ def _load_input_images(content: bytes, content_type: str, pil: Any) -> list[tuple[int, Any]]:
52
+ """Return ``(page_index, PIL.Image)`` tuples for an image or renderable file.
53
+
54
+ Image MIME types open directly; PDFs (and any type ``render_to_images`` supports)
55
+ are rasterised to one image per page. Unsupported types return ``[]``.
56
+ """
57
+ from ....utils.file_to_images import render_to_images, supported_mime_type
58
+
59
+ normalized = content_type.split(";", 1)[0].strip().lower()
60
+ try:
61
+ if normalized.startswith("image/"):
62
+ return [(0, pil.open(io.BytesIO(content)))]
63
+ if supported_mime_type(content_type):
64
+ pages = render_to_images(content, content_type)
65
+ return [(idx, pil.open(io.BytesIO(png))) for idx, png in enumerate(pages)]
66
+ except Exception as exc: # pragma: no cover - defensive
67
+ logger.warning("Failed to load input images (%s): %s", normalized, exc)
68
+ return []
41
69
 
42
70
 
43
71
  def _resolve_pipeline_severity(
@@ -2,7 +2,6 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import io
6
5
  import logging
7
6
  from typing import Any
8
7
 
@@ -11,8 +10,9 @@ from ....models.generated_single_asset_scan_results import DetectionResult
11
10
  from ...dependencies import ensure_torch, require_module
12
11
  from ._base import (
13
12
  _DEFAULT_IMAGE_CLASSIFICATION_MODEL,
14
- _IMAGE_CONTENT_TYPES,
13
+ _IMAGE_INPUT_CONTENT_TYPES,
15
14
  BaseRunner,
15
+ _load_input_images,
16
16
  _resolve_pipeline_severity,
17
17
  )
18
18
 
@@ -54,45 +54,55 @@ class ImageClassificationRunner(BaseRunner):
54
54
  raise NotImplementedError("ImageClassificationRunner uses detect() directly")
55
55
 
56
56
  def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
57
- if not content_type.startswith("image/"):
58
- return []
59
57
  if isinstance(content, str):
60
58
  logger.warning("image_classification: received string content, expected bytes")
61
59
  return []
62
60
 
61
+ # image/* opens directly; PDFs are rasterised to one image per page.
62
+ images = _load_input_images(content, content_type, self._pil)
63
+ if not images:
64
+ return []
65
+
63
66
  schema = self._schema
64
67
  threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.0
68
+ multi_page = len(images) > 1
65
69
  results: list[DetectionResult] = []
66
- try:
67
- image = self._pil.open(io.BytesIO(content))
68
- predictions: list[dict[str, Any]] = self._pipe(image) or []
69
- for pred in predictions:
70
- label: str = pred.get("label", "unknown")
71
- score: float = float(pred.get("score", 0.0))
72
- if score < threshold:
73
- continue
74
- severity = _resolve_pipeline_severity(label, schema.severity_map)
75
- results.append(
76
- self._make_result(
77
- finding_type=f"classification:{label}",
78
- category="CONTENT",
79
- severity=severity,
80
- confidence=score,
81
- matched_content=f"Image classified as: {label} ({score:.3f})",
82
- location=None,
83
- metadata={
84
- "image_size": f"{image.size[0]}x{image.size[1]}",
85
- "image_mode": image.mode,
86
- "model": self._model_id,
87
- },
70
+ for page_index, image in images:
71
+ try:
72
+ predictions: list[dict[str, Any]] = self._pipe(image) or []
73
+ for pred in predictions:
74
+ label: str = pred.get("label", "unknown")
75
+ score: float = float(pred.get("score", 0.0))
76
+ if score < threshold:
77
+ continue
78
+ severity = _resolve_pipeline_severity(label, schema.severity_map)
79
+ page_suffix = f" (page {page_index + 1})" if multi_page else ""
80
+ metadata: dict[str, Any] = {
81
+ "image_size": f"{image.size[0]}x{image.size[1]}",
82
+ "image_mode": image.mode,
83
+ "model": self._model_id,
84
+ }
85
+ if multi_page:
86
+ metadata["page"] = page_index + 1
87
+ results.append(
88
+ self._make_result(
89
+ finding_type=f"classification:{label}",
90
+ category="CONTENT",
91
+ severity=severity,
92
+ confidence=score,
93
+ matched_content=(
94
+ f"Image classified as: {label} ({score:.3f}){page_suffix}"
95
+ ),
96
+ location=None,
97
+ metadata=metadata,
98
+ )
88
99
  )
100
+ except Exception as exc:
101
+ logger.error(
102
+ "image_classification error (model=%s): %s", self._model_id, exc, exc_info=True
89
103
  )
90
- except Exception as exc:
91
- logger.error(
92
- "image_classification error (model=%s): %s", self._model_id, exc, exc_info=True
93
- )
94
104
  results.sort(key=lambda r: r.confidence, reverse=True)
95
105
  return results
96
106
 
97
107
  def get_supported_content_types(self) -> list[str]:
98
- return list(_IMAGE_CONTENT_TYPES)
108
+ return list(_IMAGE_INPUT_CONTENT_TYPES)
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import base64
5
6
  import json
6
7
  import logging
7
8
  import os
@@ -17,8 +18,9 @@ from ....models.generated_single_asset_scan_results import (
17
18
  DetectionResult,
18
19
  DetectorType,
19
20
  )
21
+ from ....utils.file_to_images import render_to_images, supported_mime_type
20
22
  from ...dependencies import require_module
21
- from ._base import _TEXT_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
23
+ from ._base import _IMAGE_CONTENT_TYPES, _TEXT_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
22
24
 
23
25
  logger = logging.getLogger(__name__)
24
26
 
@@ -29,6 +31,14 @@ _PROVIDER_PREFIX: dict[str, str] = {
29
31
  "OPENAI_COMPATIBLE": "openai",
30
32
  }
31
33
 
34
+ # Content types a vision-capable LLM detector renders to images and sends to the
35
+ # model directly. PDFs are rasterised page-by-page; images pass through.
36
+ _VISION_CONTENT_TYPES = [*_IMAGE_CONTENT_TYPES, "application/pdf"]
37
+
38
+ # Cap the number of rendered page images sent in a single completion to bound
39
+ # token cost and request size for multi-page PDFs.
40
+ _MAX_VISION_IMAGES = 20
41
+
32
42
 
33
43
  class LLMRunner(BaseRunner):
34
44
  """AI detector — sends content to a configured LLM provider for classification + extraction."""
@@ -60,7 +70,7 @@ class LLMRunner(BaseRunner):
60
70
 
61
71
  def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
62
72
  if isinstance(content, bytes):
63
- return []
73
+ return self._detect_vision(content, content_type)
64
74
  if content_type not in _TEXT_CONTENT_TYPES:
65
75
  return []
66
76
  text = content.strip()
@@ -75,7 +85,48 @@ class LLMRunner(BaseRunner):
75
85
  {"role": "system", "content": self._build_system_prompt()},
76
86
  {"role": "user", "content": snippet},
77
87
  ]
88
+ return self._complete_and_parse(messages, snippet)
89
+
90
+ def _detect_vision(self, content: bytes, content_type: str) -> list[DetectionResult]:
91
+ """Render a binary file (image/PDF) to images and classify via the model."""
92
+ if not self._vision_enabled():
93
+ return []
94
+ if not supported_mime_type(content_type):
95
+ return []
78
96
 
97
+ images = render_to_images(
98
+ content,
99
+ content_type,
100
+ max_pages=_MAX_VISION_IMAGES,
101
+ )
102
+ if not images:
103
+ return []
104
+
105
+ image_blocks = [
106
+ {
107
+ "type": "image_url",
108
+ "image_url": {
109
+ "url": f"data:image/png;base64,{base64.b64encode(png).decode('ascii')}"
110
+ },
111
+ }
112
+ for png in images[:_MAX_VISION_IMAGES]
113
+ ]
114
+ messages = [
115
+ {"role": "system", "content": self._build_system_prompt()},
116
+ {"role": "user", "content": image_blocks},
117
+ ]
118
+ # matched_content fallback descriptor — there is no text snippet for files.
119
+ descriptor = f"[{content_type}, {len(image_blocks)} page image(s)]"
120
+ return self._complete_and_parse(messages, descriptor, vision_pages=len(image_blocks))
121
+
122
+ def _complete_and_parse(
123
+ self,
124
+ messages: list[dict[str, Any]],
125
+ snippet: str,
126
+ *,
127
+ vision_pages: int | None = None,
128
+ ) -> list[DetectionResult]:
129
+ schema = self._schema
79
130
  try:
80
131
  response = self._litellm.completion(
81
132
  model=self._model_string(),
@@ -98,10 +149,16 @@ class LLMRunner(BaseRunner):
98
149
  )
99
150
  return []
100
151
 
101
- return self._results_from_payload(snippet, parsed)
152
+ return self._results_from_payload(snippet, parsed, vision_pages=vision_pages)
153
+
154
+ def _vision_enabled(self) -> bool:
155
+ return bool(getattr(self._runtime, "supports_vision", False))
102
156
 
103
157
  def get_supported_content_types(self) -> list[str]:
104
- return list(_TEXT_CONTENT_TYPES)
158
+ types = list(_TEXT_CONTENT_TYPES)
159
+ if self._vision_enabled():
160
+ types.extend(_VISION_CONTENT_TYPES)
161
+ return types
105
162
 
106
163
  # ── Internals ────────────────────────────────────────────────────────────
107
164
 
@@ -175,7 +232,13 @@ class LLMRunner(BaseRunner):
175
232
  return {}
176
233
  return parsed if isinstance(parsed, dict) else {}
177
234
 
178
- def _results_from_payload(self, snippet: str, payload: dict[str, Any]) -> list[DetectionResult]:
235
+ def _results_from_payload(
236
+ self,
237
+ snippet: str,
238
+ payload: dict[str, Any],
239
+ *,
240
+ vision_pages: int | None = None,
241
+ ) -> list[DetectionResult]:
179
242
  schema = self._schema
180
243
  threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
181
244
  default_severity = schema.severity or Severity.info
@@ -201,7 +264,7 @@ class LLMRunner(BaseRunner):
201
264
  results.append(
202
265
  DetectionResult(
203
266
  detector_type=DetectorType.CUSTOM,
204
- finding_type=f"llm:{label}",
267
+ finding_type=label,
205
268
  category="CLASSIFICATION",
206
269
  severity=severity,
207
270
  confidence=min(0.99, confidence),
@@ -216,6 +279,8 @@ class LLMRunner(BaseRunner):
216
279
  "model": self._runtime.model,
217
280
  "label": label,
218
281
  "fields": extracted,
282
+ "input": "vision" if vision_pages is not None else "text",
283
+ **({"vision_pages": vision_pages} if vision_pages is not None else {}),
219
284
  },
220
285
  extracted_data=extracted or None,
221
286
  extraction_method="LLM",
@@ -0,0 +1,121 @@
1
+ """Object detection pipeline runner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from ....models.generated_detectors import ObjectDetectionPipelineSchema
9
+ from ....models.generated_single_asset_scan_results import DetectionResult, Location
10
+ from ...dependencies import MissingDependencyError, ensure_torch, require_module
11
+ from ._base import (
12
+ _IMAGE_INPUT_CONTENT_TYPES,
13
+ BaseRunner,
14
+ _load_input_images,
15
+ _resolve_pipeline_severity,
16
+ )
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class ObjectDetectionRunner(BaseRunner):
22
+ """Object detection via a single HuggingFace object-detection pipeline."""
23
+
24
+ def __init__(
25
+ self,
26
+ schema: ObjectDetectionPipelineSchema,
27
+ detector_key: str = "",
28
+ detector_name: str = "",
29
+ ) -> None:
30
+ self._schema = schema
31
+ self._detector_key = detector_key
32
+ self._detector_name = detector_name
33
+ ensure_torch("object_detection", ["custom", "detectors"])
34
+ transformers = require_module("transformers", "object_detection", ["custom", "detectors"])
35
+ self._pil = require_module("PIL.Image", "object_detection", ["custom", "detectors"])
36
+ pipeline_kwargs: dict[str, Any] = {
37
+ "model": schema.model,
38
+ "device": schema.device or "cpu",
39
+ }
40
+ if schema.model_revision:
41
+ pipeline_kwargs["revision"] = schema.model_revision
42
+ nms = getattr(schema.nms_threshold, "root", schema.nms_threshold)
43
+ if nms is not None:
44
+ pipeline_kwargs["threshold"] = nms
45
+ try:
46
+ self._pipe: Any = transformers.pipeline("object-detection", **pipeline_kwargs)
47
+ except ImportError as exc:
48
+ raise MissingDependencyError(
49
+ "object_detection",
50
+ ["custom", "detectors"],
51
+ f"ObjectDetectionRunner requires additional dependencies: {exc}",
52
+ ) from exc
53
+
54
+ def run(self, text: str) -> None: # type: ignore[override] # pragma: no cover
55
+ raise NotImplementedError("ObjectDetectionRunner uses detect() directly")
56
+
57
+ def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
58
+ if isinstance(content, str):
59
+ logger.warning("object_detection: received string content, expected bytes")
60
+ return []
61
+
62
+ # image/* opens directly; PDFs are rasterised to one image per page.
63
+ images = _load_input_images(content, content_type, self._pil)
64
+ if not images:
65
+ return []
66
+
67
+ schema = self._schema
68
+ threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
69
+ multi_page = len(images) > 1
70
+ results: list[DetectionResult] = []
71
+ for page_index, image in images:
72
+ try:
73
+ detections: list[dict[str, Any]] = self._pipe(image) or []
74
+ for det in detections:
75
+ label: str = det.get("label", "unknown")
76
+ score: float = float(det.get("score", 0.0))
77
+ box: dict[str, int] = det.get("box", {})
78
+ if score < threshold:
79
+ continue
80
+ if schema.min_box_area is not None:
81
+ w = max(0, box.get("xmax", 0) - box.get("xmin", 0))
82
+ h = max(0, box.get("ymax", 0) - box.get("ymin", 0))
83
+ if w * h < schema.min_box_area:
84
+ continue
85
+ severity = _resolve_pipeline_severity(label, schema.severity_map)
86
+ page_prefix = f"page {page_index + 1} " if multi_page else ""
87
+ metadata: dict[str, Any] = {
88
+ "box": box,
89
+ "score": score,
90
+ "image_size": f"{image.size[0]}x{image.size[1]}",
91
+ "model": schema.model,
92
+ }
93
+ if multi_page:
94
+ metadata["page"] = page_index + 1
95
+ results.append(
96
+ self._make_result(
97
+ finding_type=label,
98
+ category="CONTENT",
99
+ severity=severity,
100
+ confidence=score,
101
+ matched_content=label,
102
+ location=Location(
103
+ description=(
104
+ f"{page_prefix}box xmin={box.get('xmin')} ymin={box.get('ymin')}"
105
+ f" xmax={box.get('xmax')} ymax={box.get('ymax')}"
106
+ ),
107
+ ),
108
+ metadata=metadata,
109
+ )
110
+ )
111
+ except Exception as exc:
112
+ logger.error(
113
+ "object_detection error (model=%s): %s", schema.model, exc, exc_info=True
114
+ )
115
+ results.sort(key=lambda r: r.confidence, reverse=True)
116
+ if schema.top_k is not None:
117
+ results = results[: schema.top_k]
118
+ return results
119
+
120
+ def get_supported_content_types(self) -> list[str]:
121
+ return list(_IMAGE_INPUT_CONTENT_TYPES)
@@ -1027,6 +1027,10 @@ class LLMProviderRuntime(BaseModel):
1027
1027
  context_size: int | None = Field(
1028
1028
  None, description='Optional context window size configured for the provider.'
1029
1029
  )
1030
+ supports_vision: bool | None = Field(
1031
+ False,
1032
+ description='Whether the resolved provider/model accepts image/PDF input. When true the detector renders supported files to images and sends them to the model as multimodal input instead of extracting text.',
1033
+ )
1030
1034
 
1031
1035
 
1032
1036
  class Type4(StrEnum):
@@ -1078,9 +1078,10 @@ class DatabricksAuthMode(StrEnum):
1078
1078
 
1079
1079
  PAT_TOKEN = 'PAT_TOKEN'
1080
1080
  SERVICE_PRINCIPAL = 'SERVICE_PRINCIPAL'
1081
+ AZURE_SERVICE_PRINCIPAL = 'AZURE_SERVICE_PRINCIPAL'
1081
1082
 
1082
1083
 
1083
- class DatabricksRequiredPat(BaseModel):
1084
+ class PersonalAccessToken(BaseModel):
1084
1085
  model_config = ConfigDict(
1085
1086
  extra='forbid',
1086
1087
  )
@@ -1094,7 +1095,7 @@ class DatabricksRequiredPat(BaseModel):
1094
1095
  )
1095
1096
 
1096
1097
 
1097
- class DatabricksRequiredServicePrincipal(BaseModel):
1098
+ class ServicePrincipalOAuthM2M(BaseModel):
1098
1099
  model_config = ConfigDict(
1099
1100
  extra='forbid',
1100
1101
  )
@@ -1109,6 +1110,24 @@ class DatabricksRequiredServicePrincipal(BaseModel):
1109
1110
  client_id: str = Field(..., description='Databricks service principal client ID')
1110
1111
 
1111
1112
 
1113
class AzureServicePrincipal(BaseModel):
    # Databricks connection settings for the AZURE_SERVICE_PRINCIPAL auth mode.
    # Every field is required and unknown keys are rejected (extra='forbid').
    model_config = ConfigDict(extra='forbid')

    # Only the literal 'AZURE_SERVICE_PRINCIPAL' validates for this field.
    auth_mode: Literal['AZURE_SERVICE_PRINCIPAL']
    workspace_url: AnyUrl = Field(
        ...,
        description=(
            'Azure Databricks workspace URL '
            '(for example, https://adb-1234567890123456.7.azuredatabricks.net)'
        ),
    )
    warehouse_id: str = Field(
        ...,
        description='Databricks SQL warehouse ID used for sampling queries',
    )
    client_id: str = Field(
        ...,
        description='Azure AD application (client) ID for the service principal',
    )
    tenant_id: str = Field(
        ...,
        description='Azure AD tenant ID',
    )
1129
+
1130
+
1112
1131
  class DatabricksMaskedPat(BaseModel):
1113
1132
  model_config = ConfigDict(
1114
1133
  extra='forbid',
@@ -1125,6 +1144,15 @@ class DatabricksMaskedServicePrincipal(BaseModel):
1125
1144
  )
1126
1145
 
1127
1146
 
1147
class DatabricksMaskedAzureServicePrincipal(BaseModel):
    # Holds the client secret for the Azure service principal.
    # The field is required and unknown keys are rejected (extra='forbid').
    model_config = ConfigDict(extra='forbid')

    client_secret: str = Field(
        ...,
        description='Azure AD client secret for the service principal',
    )
1154
+
1155
+
1128
1156
  class DatabricksOptionalConnection(BaseModel):
1129
1157
  """
1130
1158
  Databricks API and SQL statement execution tuning options.
@@ -2020,8 +2048,8 @@ class DatabricksInput(CoreInput):
2020
2048
  type: Literal['DATABRICKS'] = Field(
2021
2049
  'DATABRICKS', description='Type of the asset or source'
2022
2050
  )
2023
- required: DatabricksRequiredPat | DatabricksRequiredServicePrincipal = Field(
2024
- ..., title='DatabricksRequired'
2051
+ required: PersonalAccessToken | ServicePrincipalOAuthM2M | AzureServicePrincipal = (
2052
+ Field(..., title='DatabricksRequired')
2025
2053
  )
2026
2054
  masked: DatabricksMaskedPat | DatabricksMaskedServicePrincipal = Field(
2027
2055
  ..., title='DatabricksMasked'
@@ -131,6 +131,10 @@ class RestOutputSink:
131
131
  self.base_url = base_url.rstrip("/")
132
132
  self.timeout_sec = timeout_sec
133
133
  self.session = requests.Session()
134
+ # Disable keep-alive so stale pooled connections are never reused after
135
+ # a pod restart or server-side keep-alive timeout. Each request opens
136
+ # a fresh TCP connection, which is cheap enough for our batch cadence.
137
+ self.session.headers.update({"Connection": "close"})
134
138
  adapter = HTTPAdapter(max_retries=_RETRY_POLICY)
135
139
  self.session.mount("http://", adapter)
136
140
  self.session.mount("https://", adapter)
@@ -410,7 +410,7 @@ class DetectorPipeline:
410
410
  )
411
411
  return page_findings, page_types, page_errors, page_content, page_num
412
412
 
413
- async def _collect_done_and_flush() -> None:
413
+ async def _collect_done_and_flush(min_findings: int = 1) -> None:
414
414
  nonlocal detector_types_run, unflushed_count
415
415
  done = {t for t in pending_tasks if t.done()}
416
416
  for task in done:
@@ -430,7 +430,7 @@ class DetectorPipeline:
430
430
  )
431
431
  unflushed_count += len(page_findings)
432
432
 
433
- if unflushed_count >= findings_flush_size and unflushed_count > 0:
433
+ if unflushed_count >= min_findings and unflushed_count > 0:
434
434
  logger.debug(
435
435
  " %s flushing %d findings (%d total)",
436
436
  asset.name,
@@ -449,36 +449,17 @@ class DetectorPipeline:
449
449
  if not text_content:
450
450
  continue
451
451
 
452
+ # Bound the number of detector tasks in flight. While the buffer is
453
+ # full we batch flushes by ``findings_flush_size`` to avoid hammering
454
+ # the API when pages pile up faster than detectors can drain them.
452
455
  while len(pending_tasks) >= max_pending:
453
- done, pending_tasks_set = await asyncio.wait(
454
- pending_tasks,
455
- return_when=asyncio.FIRST_COMPLETED,
456
- )
457
- pending_tasks = pending_tasks_set
458
- for task in done:
459
- page_findings, page_types, page_errors, page_content, _pn = task.result()
460
- for finding in page_findings:
461
- self.content_provider.enrich_finding_location(
462
- finding,
463
- asset,
464
- page_content,
465
- )
466
- findings.extend(page_findings)
467
- errors.extend(page_errors)
468
- detector_types_run = self._merge_detector_types(
469
- detector_types_run,
470
- page_types,
471
- )
472
- unflushed_count += len(page_findings)
473
- if unflushed_count >= findings_flush_size and unflushed_count > 0:
474
- logger.info(
475
- " %s flushing %d findings (%d total)",
476
- asset.name,
477
- unflushed_count,
478
- len(findings),
479
- )
480
- await on_findings_flushed(list(findings))
481
- unflushed_count = 0
456
+ await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
457
+ await _collect_done_and_flush(findings_flush_size)
458
+
459
+ # Steady state: flush findings from any page that has already
460
+ # finished as soon as they are available, so real findings stream to
461
+ # the API per page instead of only once the whole asset is processed.
462
+ await _collect_done_and_flush()
482
463
 
483
464
  task = asyncio.create_task(_detect_page(text_content, page_index))
484
465
  pending_tasks.add(task)
@@ -652,7 +633,7 @@ class DetectorPipeline:
652
633
  detected_at = datetime.now(UTC)
653
634
 
654
635
  for i, (detector, result) in enumerate(zip(runnable_detectors, results, strict=False)):
655
- detector_name = detector.__class__.__name__
636
+ detector_name = self._detector_log_label(detector)
656
637
  via = task_via[i]
657
638
  loc = f"{asset_name}:{page_tag}" if page_tag else asset_name
658
639