classifyre-cli 0.4.2__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. classifyre_cli-0.4.3/.turbo/turbo-build.log +3 -0
  2. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/package.json +1 -1
  4. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/pyproject.toml +1 -1
  5. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/detector.py +6 -0
  6. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/secrets/detector.py +3 -0
  7. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/threat/code_security_detector.py +3 -0
  8. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/threat/yara_detector.py +8 -0
  9. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/main.py +81 -75
  10. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/models/generated_input.py +3 -9
  11. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/pipeline/detector_pipeline.py +333 -187
  12. classifyre_cli-0.4.3/src/pipeline/worker_pool.py +287 -0
  13. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/azure_blob_storage/source.py +3 -6
  14. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/confluence/source.py +0 -7
  15. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/google_cloud_storage/source.py +2 -8
  16. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/jira/source.py +0 -7
  17. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/object_storage/base.py +3 -16
  18. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/s3_compatible_storage/source.py +3 -10
  19. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/servicedesk/source.py +0 -7
  20. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/pipeline/test_detector_pipeline.py +8 -8
  21. classifyre_cli-0.4.3/tests/pipeline/test_worker_pool.py +479 -0
  22. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_azure_blob_storage_source.py +0 -1
  23. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_google_cloud_storage_source.py +0 -1
  24. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_s3_compatible_storage_source.py +2 -5
  25. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/uv.lock +6 -2
  26. classifyre_cli-0.4.2/.turbo/turbo-build.log +0 -3
  27. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/.gitignore +0 -0
  28. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/.python-version +0 -0
  29. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/README.md +0 -0
  30. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/main.py +0 -0
  31. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/scripts/generate_models.py +0 -0
  32. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/__init__.py +0 -0
  33. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/__init__.py +0 -0
  34. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/base.py +0 -0
  35. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/broken_links/__init__.py +0 -0
  36. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/broken_links/detector.py +0 -0
  37. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/config.py +0 -0
  38. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/content/__init__.py +0 -0
  39. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/__init__.py +0 -0
  40. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/__init__.py +0 -0
  41. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_base.py +0 -0
  42. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_factory.py +0 -0
  43. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  44. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_gliner2.py +0 -0
  45. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_image_classification.py +0 -0
  46. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_llm.py +0 -0
  47. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_object_detection.py +0 -0
  48. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_regex.py +0 -0
  49. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/runners/_text_classification.py +0 -0
  50. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/custom/trainer.py +0 -0
  51. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/dependencies.py +0 -0
  52. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/pii/__init__.py +0 -0
  53. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/pii/detector.py +0 -0
  54. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/secrets/__init__.py +0 -0
  55. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/detectors/threat/__init__.py +0 -0
  56. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/models/generated_detectors.py +0 -0
  57. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/models/generated_single_asset_scan_results.py +0 -0
  58. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/outputs/__init__.py +0 -0
  59. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/outputs/base.py +0 -0
  60. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/outputs/console.py +0 -0
  61. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/outputs/factory.py +0 -0
  62. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/outputs/file.py +0 -0
  63. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/outputs/rest.py +0 -0
  64. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/pipeline/__init__.py +0 -0
  65. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/pipeline/content_provider.py +0 -0
  66. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/pipeline/parsed_content_provider.py +0 -0
  67. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sandbox/__init__.py +0 -0
  68. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sandbox/runner.py +0 -0
  69. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/__init__.py +0 -0
  70. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/atlassian_common.py +0 -0
  71. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/azure_blob_storage/__init__.py +0 -0
  72. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/base.py +0 -0
  73. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/confluence/__init__.py +0 -0
  74. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/databricks/__init__.py +0 -0
  75. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/databricks/source.py +0 -0
  76. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/dependencies.py +0 -0
  77. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/google_cloud_storage/__init__.py +0 -0
  78. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/hive/__init__.py +0 -0
  79. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/hive/source.py +0 -0
  80. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/jira/__init__.py +0 -0
  81. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/mongodb/__init__.py +0 -0
  82. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/mongodb/source.py +0 -0
  83. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/mssql/__init__.py +0 -0
  84. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/mssql/source.py +0 -0
  85. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/mysql/__init__.py +0 -0
  86. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/mysql/source.py +0 -0
  87. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/neo4j/__init__.py +0 -0
  88. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/neo4j/source.py +0 -0
  89. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/oracle/__init__.py +0 -0
  90. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/oracle/source.py +0 -0
  91. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/postgresql/__init__.py +0 -0
  92. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/postgresql/source.py +0 -0
  93. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/powerbi/__init__.py +0 -0
  94. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/powerbi/source.py +0 -0
  95. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/recipe_normalizer.py +0 -0
  96. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/s3_compatible_storage/README.md +0 -0
  97. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/s3_compatible_storage/__init__.py +0 -0
  98. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/servicedesk/__init__.py +0 -0
  99. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/slack/__init__.py +0 -0
  100. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/slack/source.py +0 -0
  101. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/snowflake/__init__.py +0 -0
  102. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/snowflake/source.py +0 -0
  103. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/tableau/__init__.py +0 -0
  104. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/tableau/source.py +0 -0
  105. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/tabular_utils.py +0 -0
  106. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/wordpress/__init__.py +0 -0
  107. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/sources/wordpress/source.py +0 -0
  108. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/telemetry.py +0 -0
  109. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/utils/__init__.py +0 -0
  110. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/utils/content_extraction.py +0 -0
  111. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/utils/file_parser.py +0 -0
  112. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/utils/hashing.py +0 -0
  113. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/utils/uv_sync.py +0 -0
  114. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/src/utils/validation.py +0 -0
  115. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/__init__.py +0 -0
  116. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/conftest.py +0 -0
  117. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/__init__.py +0 -0
  118. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  119. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/conftest.py +0 -0
  120. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/content/__init__.py +0 -0
  121. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/custom/__init__.py +0 -0
  122. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/custom/conftest.py +0 -0
  123. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  124. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  125. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/custom/test_regex_runner.py +0 -0
  126. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/custom/test_transformer_runners.py +0 -0
  127. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/pii/__init__.py +0 -0
  128. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/pii/conftest.py +0 -0
  129. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/pii/sample_invoice.pdf +0 -0
  130. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/pii/test_pii_detector.py +0 -0
  131. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  132. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/secrets/__init__.py +0 -0
  133. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  134. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  135. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/test_base_detector.py +0 -0
  136. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  137. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  138. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/test_detector_pipeline_types.py +0 -0
  139. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/test_detector_schema_examples.py +0 -0
  140. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/test_detector_types.py +0 -0
  141. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/test_phase2_detectors.py +0 -0
  142. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/test_registry.py +0 -0
  143. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/threat/__init__.py +0 -0
  144. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/threat/test_code_security_detector.py +0 -0
  145. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/detectors/threat/test_yara_detector.py +0 -0
  146. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  147. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/integration/test_wordpress_links_assets.py +0 -0
  148. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_base_source_attachment.py +0 -0
  149. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_base_source_sampling.py +0 -0
  150. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_confluence_source.py +0 -0
  151. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_databricks_source.py +0 -0
  152. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_hashing.py +0 -0
  153. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_hive_source.py +0 -0
  154. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_jira_source.py +0 -0
  155. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_mongodb_source.py +0 -0
  156. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_mssql_source.py +0 -0
  157. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_mysql_source.py +0 -0
  158. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_neo4j_source.py +0 -0
  159. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_oracle_source.py +0 -0
  160. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_outputs.py +0 -0
  161. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_postgresql_source.py +0 -0
  162. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_powerbi_source.py +0 -0
  163. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_recipe_normalizer.py +0 -0
  164. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_servicedesk_source.py +0 -0
  165. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_slack_source.py +0 -0
  166. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_snowflake_source.py +0 -0
  167. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_source_dependency_groups.py +0 -0
  168. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_tableau_source.py +0 -0
  169. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_tabular_utils.py +0 -0
  170. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/test_wordpress_source.py +0 -0
  171. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/utils/test_content_extraction.py +0 -0
  172. {classifyre_cli-0.4.2 → classifyre_cli-0.4.3}/tests/utils/test_file_parser.py +0 -0
@@ -0,0 +1,3 @@
1
+ $ uv sync
2
+ Resolved 256 packages in 159ms
3
+ Checked 49 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.2",
3
+ "version": "0.4.3",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.2"
3
+ version = "0.4.3"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import asyncio
5
6
  import logging
6
7
 
7
8
  from ...models.generated_detectors import (
@@ -34,6 +35,11 @@ class CustomDetector(BaseDetector):
34
35
 
35
36
  async def detect(
36
37
  self, content: str | bytes, content_type: str = "text/plain"
38
+ ) -> list[DetectionResult]:
39
+ return await asyncio.to_thread(self._detect_sync, content, content_type)
40
+
41
+ def _detect_sync(
42
+ self, content: str | bytes, content_type: str = "text/plain"
37
43
  ) -> list[DetectionResult]:
38
44
  findings = self._runner.detect(content, content_type)
39
45
  max_findings = self.custom_config.max_findings
@@ -5,6 +5,7 @@ plugin's ``analyze_line`` directly. No temp files, no global Settings state,
5
5
  and no ``SecretsCollection`` needed.
6
6
  """
7
7
 
8
+ import asyncio
8
9
  import importlib
9
10
  import logging
10
11
  import pkgutil
@@ -304,7 +305,9 @@ class SecretsDetector(BaseDetector):
304
305
  len(content),
305
306
  )
306
307
  return []
308
+ return await asyncio.to_thread(self._detect_sync, content)
307
309
 
310
+ def _detect_sync(self, content: str) -> list[DetectionResult]:
308
311
  plugins = self._build_plugins()
309
312
  if not plugins:
310
313
  return []
@@ -1,5 +1,6 @@
1
1
  """Code security detector using Bandit static analysis."""
2
2
 
3
+ import asyncio
3
4
  import json
4
5
  import logging
5
6
  import subprocess
@@ -137,7 +138,9 @@ class CodeSecurityDetector(BaseDetector):
137
138
  return []
138
139
  if not content.strip():
139
140
  return []
141
+ return await asyncio.to_thread(self._detect_sync, content)
140
142
 
143
+ def _detect_sync(self, content: str) -> list[DetectionResult]:
141
144
  threshold = self._cfg.confidence_threshold or 0.7
142
145
  max_findings = self._cfg.max_findings or 25
143
146
  findings: list[DetectionResult] = []
@@ -1,5 +1,6 @@
1
1
  """YARA-based threat detector — compiles structured rule objects into a live ruleset."""
2
2
 
3
+ import asyncio
3
4
  import logging
4
5
  import re
5
6
 
@@ -89,6 +90,13 @@ class YaraDetector(BaseDetector):
89
90
 
90
91
  async def detect(
91
92
  self, content: str | bytes, content_type: str = "text/plain"
93
+ ) -> list[DetectionResult]:
94
+ if self._rules is None:
95
+ return []
96
+ return await asyncio.to_thread(self._detect_sync, content, content_type)
97
+
98
+ def _detect_sync(
99
+ self, content: str | bytes, content_type: str = "text/plain"
92
100
  ) -> list[DetectionResult]:
93
101
  if self._rules is None:
94
102
  return []
@@ -179,15 +179,32 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
179
179
  sink_started = True
180
180
 
181
181
  from .pipeline.detector_pipeline import DetectorPipeline
182
+ from .pipeline.worker_pool import (
183
+ DetectorWorkerPool,
184
+ compute_pool_workers,
185
+ )
186
+
187
+ pool_workers = compute_pool_workers(
188
+ override=args.max_pool_workers,
189
+ )
190
+ worker_pool: DetectorWorkerPool | None = None
182
191
 
183
192
  pipeline = DetectorPipeline.from_recipe(
184
193
  recipe,
185
194
  source,
186
195
  runner_id,
187
- max_concurrent_assets=args.detector_max_concurrent,
188
196
  )
189
197
  has_detectors = bool(pipeline.detectors)
190
198
 
199
+ if has_detectors:
200
+ worker_pool = DetectorWorkerPool(max_workers=pool_workers)
201
+ pipeline = DetectorPipeline.from_recipe(
202
+ recipe,
203
+ source,
204
+ runner_id,
205
+ worker_pool=worker_pool,
206
+ )
207
+
191
208
  # --- Phase 1: Discovery ---
192
209
  source.set_discovery_only(True)
193
210
  all_stubs: list[Any] = []
@@ -225,74 +242,75 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
225
242
  if has_detectors and all_stubs:
226
243
  import asyncio as _asyncio
227
244
 
228
- workers = args.processing_workers
229
- semaphore = _asyncio.Semaphore(workers)
230
245
  processed_count = 0
246
+ logger.info(
247
+ "Phase 2 starting: %d assets, pool_workers=%s",
248
+ len(all_stubs),
249
+ worker_pool.max_workers if worker_pool else "none",
250
+ )
231
251
  error_count = 0
232
252
 
233
253
  async def _process_one(asset: Any) -> None:
234
254
  nonlocal processed_count, error_count
235
- async with semaphore:
236
- asset_hash = getattr(asset, "hash", None) or ""
237
- try:
238
- if hasattr(sink, "update_asset_status"):
239
- await sink.update_asset_status(asset_hash, "PROCESSING")
240
-
241
- async def _on_findings_flushed(partial: list[Any]) -> None:
242
- # partial is the full accumulated findings list from the pipeline
243
- stub_payload = _asset_to_payload(asset)
244
- stub_payload["findings"] = [
245
- f.model_dump(mode="json", exclude_none=True)
246
- if hasattr(f, "model_dump")
247
- else f
248
- for f in partial
249
- ]
250
- await sink.emit_batch([stub_payload], skip_findings=False)
251
- if hasattr(sink, "update_asset_status"):
252
- f_total, f_by_sev, f_by_det = _compute_findings_counts(
253
- partial
254
- )
255
- await sink.update_asset_status(
256
- asset_hash,
257
- "PROCESSING",
258
- findings_total=f_total,
259
- findings_by_severity=f_by_sev,
260
- findings_by_detector=f_by_det,
261
- )
262
-
263
- result = await pipeline.process_single_asset(
264
- asset,
265
- on_findings_flushed=_on_findings_flushed,
266
- findings_flush_size=args.detector_flush_batch_size,
267
- )
268
- payload = _asset_to_payload(result)
269
- await sink.emit_batch([payload], skip_findings=False)
270
-
255
+ asset_hash = getattr(asset, "hash", None) or ""
256
+ try:
257
+ if hasattr(sink, "update_asset_status"):
258
+ await sink.update_asset_status(asset_hash, "PROCESSING")
259
+
260
+ async def _on_findings_flushed(partial: list[Any]) -> None:
261
+ stub_payload = _asset_to_payload(asset)
262
+ stub_payload["findings"] = [
263
+ f.model_dump(mode="json", exclude_none=True)
264
+ if hasattr(f, "model_dump")
265
+ else f
266
+ for f in partial
267
+ ]
268
+ await sink.emit_batch([stub_payload], skip_findings=False)
271
269
  if hasattr(sink, "update_asset_status"):
272
270
  f_total, f_by_sev, f_by_det = _compute_findings_counts(
273
- result.findings or []
271
+ partial
274
272
  )
275
273
  await sink.update_asset_status(
276
274
  asset_hash,
277
- "PROCESSED",
275
+ "PROCESSING",
278
276
  findings_total=f_total,
279
277
  findings_by_severity=f_by_sev,
280
278
  findings_by_detector=f_by_det,
281
279
  )
282
280
 
283
- source.evict_asset_cache(asset_hash)
284
- processed_count += 1
285
- except Exception as exc:
286
- error_count += 1
287
- logger.error("Asset %s failed: %s", asset_hash, exc)
288
- if hasattr(sink, "update_asset_status"):
289
- try:
290
- error_msg = str(exc) or type(exc).__name__
291
- await sink.update_asset_status(
292
- asset_hash, "ERROR", error_msg
293
- )
294
- except Exception:
295
- pass
281
+ result = await pipeline.process_single_asset(
282
+ asset,
283
+ on_findings_flushed=_on_findings_flushed,
284
+ findings_flush_size=args.detector_flush_batch_size,
285
+ )
286
+ payload = _asset_to_payload(result)
287
+ await sink.emit_batch([payload], skip_findings=False)
288
+
289
+ if hasattr(sink, "update_asset_status"):
290
+ f_total, f_by_sev, f_by_det = _compute_findings_counts(
291
+ result.findings or []
292
+ )
293
+ await sink.update_asset_status(
294
+ asset_hash,
295
+ "PROCESSED",
296
+ findings_total=f_total,
297
+ findings_by_severity=f_by_sev,
298
+ findings_by_detector=f_by_det,
299
+ )
300
+
301
+ source.evict_asset_cache(asset_hash)
302
+ processed_count += 1
303
+ except Exception as exc:
304
+ error_count += 1
305
+ logger.error("Asset %s failed: %s", asset_hash, exc)
306
+ if hasattr(sink, "update_asset_status"):
307
+ try:
308
+ error_msg = str(exc) or type(exc).__name__
309
+ await sink.update_asset_status(
310
+ asset_hash, "ERROR", error_msg
311
+ )
312
+ except Exception:
313
+ pass
296
314
 
297
315
  tasks = [_asyncio.create_task(_process_one(a)) for a in all_stubs]
298
316
  await _asyncio.gather(*tasks, return_exceptions=True)
@@ -340,6 +358,9 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
340
358
  "Failed to mark sink failure: %s", sink_error, exc_info=True
341
359
  )
342
360
  raise
361
+ finally:
362
+ if worker_pool is not None:
363
+ worker_pool.shutdown(wait=True)
343
364
 
344
365
  except Exception as e:
345
366
  logger.debug("Traceback for %s failure:", args.command, exc_info=True)
@@ -527,16 +548,10 @@ def main() -> None:
527
548
  help="How many detector-processed assets to accumulate before pushing findings to the API (default: 5, env: CLASSIFYRE_DETECTOR_FLUSH_BATCH_SIZE)",
528
549
  )
529
550
  parser.add_argument(
530
- "--detector-max-concurrent",
531
- type=int,
532
- default=None,
533
- help="Max assets processed in parallel by the detector pipeline (default: 10, env: CLASSIFYRE_DETECTOR_MAX_CONCURRENT)",
534
- )
535
- parser.add_argument(
536
- "--processing-workers",
551
+ "--max-pool-workers",
537
552
  type=int,
538
553
  default=None,
539
- help="Number of parallel asset-processing workers in Phase 2 (default: 2, env: CLASSIFYRE_PROCESSING_WORKERS)",
554
+ help="Max OS processes in the detector pool. Auto-sized from CPU/memory when omitted (env: CLASSIFYRE_MAX_POOL_WORKERS)",
540
555
  )
541
556
 
542
557
  args = parser.parse_args()
@@ -549,21 +564,12 @@ def main() -> None:
549
564
  args.detector_flush_batch_size = 5
550
565
  args.detector_flush_batch_size = max(args.detector_flush_batch_size, 1)
551
566
 
552
- if args.detector_max_concurrent is None:
553
- env_val = os.environ.get("CLASSIFYRE_DETECTOR_MAX_CONCURRENT")
554
- try:
555
- args.detector_max_concurrent = int(env_val) if env_val else 10
556
- except ValueError:
557
- args.detector_max_concurrent = 10
558
- args.detector_max_concurrent = max(args.detector_max_concurrent, 1)
559
-
560
- if args.processing_workers is None:
561
- env_val = os.environ.get("CLASSIFYRE_PROCESSING_WORKERS")
567
+ if args.max_pool_workers is None:
568
+ env_val = os.environ.get("CLASSIFYRE_MAX_POOL_WORKERS")
562
569
  try:
563
- args.processing_workers = int(env_val) if env_val else 2
570
+ args.max_pool_workers = int(env_val) if env_val else None
564
571
  except ValueError:
565
- args.processing_workers = 2
566
- args.processing_workers = max(args.processing_workers, 1)
572
+ args.max_pool_workers = None
567
573
 
568
574
  if args.debug:
569
575
  logging.getLogger().setLevel(logging.DEBUG)
@@ -163,17 +163,11 @@ class ResourceOverrides(BaseModel):
163
163
  ge=60,
164
164
  le=86400,
165
165
  )
166
- processing_workers: int | None = Field(
166
+ max_pool_workers: int | None = Field(
167
167
  None,
168
- description='Number of parallel asset-processing workers in Phase 2 (default: 2)',
168
+ description='Max OS processes in the detector pool. Auto-sized from CPU/memory limits when omitted.',
169
169
  ge=1,
170
- le=20,
171
- )
172
- detector_max_concurrent: int | None = Field(
173
- None,
174
- description='Max concurrent detector invocations across all pages (default: 5)',
175
- ge=1,
176
- le=50,
170
+ le=16,
177
171
  )
178
172
 
179
173