classifyre-cli 0.4.2__tar.gz → 0.4.4__tar.gz

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
Files changed (182)
  1. classifyre_cli-0.4.4/.turbo/turbo-build.log +3 -0
  2. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/package.json +1 -1
  4. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/pyproject.toml +1 -1
  5. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/detector.py +6 -0
  6. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/secrets/detector.py +3 -0
  7. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/threat/code_security_detector.py +3 -0
  8. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/threat/yara_detector.py +8 -0
  9. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/main.py +105 -68
  10. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/models/generated_input.py +63 -5
  11. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/pipeline/detector_pipeline.py +353 -182
  12. classifyre_cli-0.4.4/src/pipeline/worker_pool.py +294 -0
  13. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/azure_blob_storage/source.py +3 -6
  14. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/confluence/source.py +0 -7
  15. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/databricks/source.py +287 -672
  16. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/google_cloud_storage/source.py +2 -8
  17. classifyre_cli-0.4.4/src/sources/hive/source.py +304 -0
  18. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/jira/source.py +0 -7
  19. classifyre_cli-0.4.4/src/sources/mssql/source.py +621 -0
  20. classifyre_cli-0.4.4/src/sources/mysql/source.py +303 -0
  21. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/object_storage/base.py +3 -16
  22. classifyre_cli-0.4.4/src/sources/oracle/source.py +632 -0
  23. classifyre_cli-0.4.4/src/sources/postgresql/source.py +214 -0
  24. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/source.py +3 -10
  25. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/servicedesk/source.py +0 -7
  26. classifyre_cli-0.4.4/src/sources/snowflake/source.py +624 -0
  27. classifyre_cli-0.4.4/src/sources/sqlite/source.py +212 -0
  28. classifyre_cli-0.4.4/src/sources/tabular_base.py +793 -0
  29. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/tabular_utils.py +36 -0
  30. classifyre_cli-0.4.4/tests/detectors/threat/__init__.py +0 -0
  31. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/pipeline/test_detector_pipeline.py +5 -8
  32. classifyre_cli-0.4.4/tests/pipeline/test_worker_pool.py +480 -0
  33. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_azure_blob_storage_source.py +0 -1
  34. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_databricks_source.py +9 -9
  35. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_google_cloud_storage_source.py +0 -1
  36. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_hive_source.py +8 -8
  37. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_mssql_source.py +5 -5
  38. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_mysql_source.py +8 -8
  39. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_oracle_source.py +27 -41
  40. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_postgresql_source.py +3 -0
  41. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_s3_compatible_storage_source.py +2 -5
  42. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_snowflake_source.py +2 -2
  43. classifyre_cli-0.4.4/tests/test_sqlite_source.py +336 -0
  44. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/uv.lock +6 -2
  45. classifyre_cli-0.4.2/.turbo/turbo-build.log +0 -3
  46. classifyre_cli-0.4.2/src/sources/hive/source.py +0 -709
  47. classifyre_cli-0.4.2/src/sources/mssql/source.py +0 -1034
  48. classifyre_cli-0.4.2/src/sources/mysql/source.py +0 -797
  49. classifyre_cli-0.4.2/src/sources/oracle/source.py +0 -982
  50. classifyre_cli-0.4.2/src/sources/postgresql/source.py +0 -774
  51. classifyre_cli-0.4.2/src/sources/snowflake/source.py +0 -912
  52. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/.gitignore +0 -0
  53. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/.python-version +0 -0
  54. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/README.md +0 -0
  55. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/main.py +0 -0
  56. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/scripts/generate_models.py +0 -0
  57. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/__init__.py +0 -0
  58. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/__init__.py +0 -0
  59. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/base.py +0 -0
  60. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/broken_links/__init__.py +0 -0
  61. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/broken_links/detector.py +0 -0
  62. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/config.py +0 -0
  63. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/content/__init__.py +0 -0
  64. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/__init__.py +0 -0
  65. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/__init__.py +0 -0
  66. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_base.py +0 -0
  67. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_factory.py +0 -0
  68. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  69. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_gliner2.py +0 -0
  70. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_image_classification.py +0 -0
  71. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_llm.py +0 -0
  72. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_object_detection.py +0 -0
  73. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_regex.py +0 -0
  74. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_text_classification.py +0 -0
  75. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/trainer.py +0 -0
  76. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/dependencies.py +0 -0
  77. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/pii/__init__.py +0 -0
  78. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/pii/detector.py +0 -0
  79. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/secrets/__init__.py +0 -0
  80. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/threat/__init__.py +0 -0
  81. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/models/generated_detectors.py +0 -0
  82. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/models/generated_single_asset_scan_results.py +0 -0
  83. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/__init__.py +0 -0
  84. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/base.py +0 -0
  85. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/console.py +0 -0
  86. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/factory.py +0 -0
  87. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/file.py +0 -0
  88. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/rest.py +0 -0
  89. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/pipeline/__init__.py +0 -0
  90. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/pipeline/content_provider.py +0 -0
  91. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/pipeline/parsed_content_provider.py +0 -0
  92. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sandbox/__init__.py +0 -0
  93. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sandbox/runner.py +0 -0
  94. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/__init__.py +0 -0
  95. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/atlassian_common.py +0 -0
  96. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/azure_blob_storage/__init__.py +0 -0
  97. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/base.py +0 -0
  98. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/confluence/__init__.py +0 -0
  99. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/databricks/__init__.py +0 -0
  100. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/dependencies.py +0 -0
  101. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/google_cloud_storage/__init__.py +0 -0
  102. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/hive/__init__.py +0 -0
  103. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/jira/__init__.py +0 -0
  104. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/mongodb/__init__.py +0 -0
  105. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/mongodb/source.py +0 -0
  106. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/mssql/__init__.py +0 -0
  107. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/mysql/__init__.py +0 -0
  108. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/neo4j/__init__.py +0 -0
  109. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/neo4j/source.py +0 -0
  110. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/oracle/__init__.py +0 -0
  111. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/postgresql/__init__.py +0 -0
  112. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/powerbi/__init__.py +0 -0
  113. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/powerbi/source.py +0 -0
  114. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/recipe_normalizer.py +0 -0
  115. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/README.md +0 -0
  116. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/__init__.py +0 -0
  117. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/servicedesk/__init__.py +0 -0
  118. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/slack/__init__.py +0 -0
  119. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/slack/source.py +0 -0
  120. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/snowflake/__init__.py +0 -0
  121. {classifyre_cli-0.4.2/tests → classifyre_cli-0.4.4/src/sources/sqlite}/__init__.py +0 -0
  122. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/tableau/__init__.py +0 -0
  123. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/tableau/source.py +0 -0
  124. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/wordpress/__init__.py +0 -0
  125. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/wordpress/source.py +0 -0
  126. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/telemetry.py +0 -0
  127. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/__init__.py +0 -0
  128. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/content_extraction.py +0 -0
  129. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/file_parser.py +0 -0
  130. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/hashing.py +0 -0
  131. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/uv_sync.py +0 -0
  132. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/validation.py +0 -0
  133. {classifyre_cli-0.4.2/tests/detectors → classifyre_cli-0.4.4/tests}/__init__.py +0 -0
  134. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/conftest.py +0 -0
  135. {classifyre_cli-0.4.2/tests/detectors/content → classifyre_cli-0.4.4/tests/detectors}/__init__.py +0 -0
  136. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  137. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/conftest.py +0 -0
  138. {classifyre_cli-0.4.2/tests/detectors/custom → classifyre_cli-0.4.4/tests/detectors/content}/__init__.py +0 -0
  139. {classifyre_cli-0.4.2/tests/detectors/pii → classifyre_cli-0.4.4/tests/detectors/custom}/__init__.py +0 -0
  140. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/conftest.py +0 -0
  141. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  142. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  143. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/test_regex_runner.py +0 -0
  144. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/test_transformer_runners.py +0 -0
  145. {classifyre_cli-0.4.2/tests/detectors/secrets → classifyre_cli-0.4.4/tests/detectors/pii}/__init__.py +0 -0
  146. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/pii/conftest.py +0 -0
  147. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/pii/sample_invoice.pdf +0 -0
  148. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/pii/test_pii_detector.py +0 -0
  149. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  150. {classifyre_cli-0.4.2/tests/detectors/threat → classifyre_cli-0.4.4/tests/detectors/secrets}/__init__.py +0 -0
  151. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  152. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  153. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_base_detector.py +0 -0
  154. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  155. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  156. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_detector_pipeline_types.py +0 -0
  157. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_detector_schema_examples.py +0 -0
  158. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_detector_types.py +0 -0
  159. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_phase2_detectors.py +0 -0
  160. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_registry.py +0 -0
  161. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/threat/test_code_security_detector.py +0 -0
  162. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/threat/test_yara_detector.py +0 -0
  163. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  164. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/integration/test_wordpress_links_assets.py +0 -0
  165. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_base_source_attachment.py +0 -0
  166. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_base_source_sampling.py +0 -0
  167. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_confluence_source.py +0 -0
  168. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_hashing.py +0 -0
  169. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_jira_source.py +0 -0
  170. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_mongodb_source.py +0 -0
  171. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_neo4j_source.py +0 -0
  172. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_outputs.py +0 -0
  173. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_powerbi_source.py +0 -0
  174. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_recipe_normalizer.py +0 -0
  175. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_servicedesk_source.py +0 -0
  176. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_slack_source.py +0 -0
  177. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_source_dependency_groups.py +0 -0
  178. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_tableau_source.py +0 -0
  179. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_tabular_utils.py +0 -0
  180. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_wordpress_source.py +0 -0
  181. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/utils/test_content_extraction.py +0 -0
  182. {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/utils/test_file_parser.py +0 -0
@@ -0,0 +1,3 @@
1
+ $ uv sync
2
+ Resolved 256 packages in 170ms
3
+ Checked 49 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.2
3
+ Version: 0.4.4
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.2",
3
+ "version": "0.4.4",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.2"
3
+ version = "0.4.4"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import asyncio
5
6
  import logging
6
7
 
7
8
  from ...models.generated_detectors import (
@@ -34,6 +35,11 @@ class CustomDetector(BaseDetector):
34
35
 
35
36
  async def detect(
36
37
  self, content: str | bytes, content_type: str = "text/plain"
38
+ ) -> list[DetectionResult]:
39
+ return await asyncio.to_thread(self._detect_sync, content, content_type)
40
+
41
+ def _detect_sync(
42
+ self, content: str | bytes, content_type: str = "text/plain"
37
43
  ) -> list[DetectionResult]:
38
44
  findings = self._runner.detect(content, content_type)
39
45
  max_findings = self.custom_config.max_findings
@@ -5,6 +5,7 @@ plugin's ``analyze_line`` directly. No temp files, no global Settings state,
5
5
  and no ``SecretsCollection`` needed.
6
6
  """
7
7
 
8
+ import asyncio
8
9
  import importlib
9
10
  import logging
10
11
  import pkgutil
@@ -304,7 +305,9 @@ class SecretsDetector(BaseDetector):
304
305
  len(content),
305
306
  )
306
307
  return []
308
+ return await asyncio.to_thread(self._detect_sync, content)
307
309
 
310
+ def _detect_sync(self, content: str) -> list[DetectionResult]:
308
311
  plugins = self._build_plugins()
309
312
  if not plugins:
310
313
  return []
@@ -1,5 +1,6 @@
1
1
  """Code security detector using Bandit static analysis."""
2
2
 
3
+ import asyncio
3
4
  import json
4
5
  import logging
5
6
  import subprocess
@@ -137,7 +138,9 @@ class CodeSecurityDetector(BaseDetector):
137
138
  return []
138
139
  if not content.strip():
139
140
  return []
141
+ return await asyncio.to_thread(self._detect_sync, content)
140
142
 
143
+ def _detect_sync(self, content: str) -> list[DetectionResult]:
141
144
  threshold = self._cfg.confidence_threshold or 0.7
142
145
  max_findings = self._cfg.max_findings or 25
143
146
  findings: list[DetectionResult] = []
@@ -1,5 +1,6 @@
1
1
  """YARA-based threat detector — compiles structured rule objects into a live ruleset."""
2
2
 
3
+ import asyncio
3
4
  import logging
4
5
  import re
5
6
 
@@ -89,6 +90,13 @@ class YaraDetector(BaseDetector):
89
90
 
90
91
  async def detect(
91
92
  self, content: str | bytes, content_type: str = "text/plain"
93
+ ) -> list[DetectionResult]:
94
+ if self._rules is None:
95
+ return []
96
+ return await asyncio.to_thread(self._detect_sync, content, content_type)
97
+
98
+ def _detect_sync(
99
+ self, content: str | bytes, content_type: str = "text/plain"
92
100
  ) -> list[DetectionResult]:
93
101
  if self._rules is None:
94
102
  return []
@@ -179,15 +179,32 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
179
179
  sink_started = True
180
180
 
181
181
  from .pipeline.detector_pipeline import DetectorPipeline
182
+ from .pipeline.worker_pool import (
183
+ DetectorWorkerPool,
184
+ compute_pool_workers,
185
+ )
186
+
187
+ pool_workers = compute_pool_workers(
188
+ override=args.max_pool_workers,
189
+ )
190
+ worker_pool: DetectorWorkerPool | None = None
182
191
 
183
192
  pipeline = DetectorPipeline.from_recipe(
184
193
  recipe,
185
194
  source,
186
195
  runner_id,
187
- max_concurrent_assets=args.detector_max_concurrent,
188
196
  )
189
197
  has_detectors = bool(pipeline.detectors)
190
198
 
199
+ if has_detectors:
200
+ worker_pool = DetectorWorkerPool(max_workers=pool_workers)
201
+ pipeline = DetectorPipeline.from_recipe(
202
+ recipe,
203
+ source,
204
+ runner_id,
205
+ worker_pool=worker_pool,
206
+ )
207
+
191
208
  # --- Phase 1: Discovery ---
192
209
  source.set_discovery_only(True)
193
210
  all_stubs: list[Any] = []
@@ -225,74 +242,84 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
225
242
  if has_detectors and all_stubs:
226
243
  import asyncio as _asyncio
227
244
 
228
- workers = args.processing_workers
229
- semaphore = _asyncio.Semaphore(workers)
230
245
  processed_count = 0
246
+ _pw = worker_pool.max_workers if worker_pool else 4
247
+ max_concurrent = args.max_concurrent_assets or (_pw * 2)
248
+ max_concurrent = max(1, max_concurrent)
249
+ _asset_semaphore = _asyncio.Semaphore(max_concurrent)
250
+ logger.info(
251
+ "Phase 2 starting: %d assets, pool_workers=%s, max_concurrent_assets=%d",
252
+ len(all_stubs),
253
+ worker_pool.max_workers if worker_pool else "none",
254
+ max_concurrent,
255
+ )
231
256
  error_count = 0
232
257
 
233
258
  async def _process_one(asset: Any) -> None:
234
- nonlocal processed_count, error_count
235
- async with semaphore:
236
- asset_hash = getattr(asset, "hash", None) or ""
237
- try:
238
- if hasattr(sink, "update_asset_status"):
239
- await sink.update_asset_status(asset_hash, "PROCESSING")
240
-
241
- async def _on_findings_flushed(partial: list[Any]) -> None:
242
- # partial is the full accumulated findings list from the pipeline
243
- stub_payload = _asset_to_payload(asset)
244
- stub_payload["findings"] = [
245
- f.model_dump(mode="json", exclude_none=True)
246
- if hasattr(f, "model_dump")
247
- else f
248
- for f in partial
249
- ]
250
- await sink.emit_batch([stub_payload], skip_findings=False)
251
- if hasattr(sink, "update_asset_status"):
252
- f_total, f_by_sev, f_by_det = _compute_findings_counts(
253
- partial
254
- )
255
- await sink.update_asset_status(
256
- asset_hash,
257
- "PROCESSING",
258
- findings_total=f_total,
259
- findings_by_severity=f_by_sev,
260
- findings_by_detector=f_by_det,
261
- )
262
-
263
- result = await pipeline.process_single_asset(
264
- asset,
265
- on_findings_flushed=_on_findings_flushed,
266
- findings_flush_size=args.detector_flush_batch_size,
267
- )
268
- payload = _asset_to_payload(result)
269
- await sink.emit_batch([payload], skip_findings=False)
259
+ async with _asset_semaphore:
260
+ await _process_one_inner(asset)
270
261
 
262
+ async def _process_one_inner(asset: Any) -> None:
263
+ nonlocal processed_count, error_count
264
+ asset_hash = getattr(asset, "hash", None) or ""
265
+ try:
266
+ if hasattr(sink, "update_asset_status"):
267
+ await sink.update_asset_status(asset_hash, "PROCESSING")
268
+
269
+ async def _on_findings_flushed(partial: list[Any]) -> None:
270
+ stub_payload = _asset_to_payload(asset)
271
+ stub_payload["findings"] = [
272
+ f.model_dump(mode="json", exclude_none=True)
273
+ if hasattr(f, "model_dump")
274
+ else f
275
+ for f in partial
276
+ ]
277
+ await sink.emit_batch([stub_payload], skip_findings=False)
271
278
  if hasattr(sink, "update_asset_status"):
272
279
  f_total, f_by_sev, f_by_det = _compute_findings_counts(
273
- result.findings or []
280
+ partial
274
281
  )
275
282
  await sink.update_asset_status(
276
283
  asset_hash,
277
- "PROCESSED",
284
+ "PROCESSING",
278
285
  findings_total=f_total,
279
286
  findings_by_severity=f_by_sev,
280
287
  findings_by_detector=f_by_det,
281
288
  )
282
289
 
283
- source.evict_asset_cache(asset_hash)
284
- processed_count += 1
285
- except Exception as exc:
286
- error_count += 1
287
- logger.error("Asset %s failed: %s", asset_hash, exc)
288
- if hasattr(sink, "update_asset_status"):
289
- try:
290
- error_msg = str(exc) or type(exc).__name__
291
- await sink.update_asset_status(
292
- asset_hash, "ERROR", error_msg
293
- )
294
- except Exception:
295
- pass
290
+ result = await pipeline.process_single_asset(
291
+ asset,
292
+ on_findings_flushed=_on_findings_flushed,
293
+ findings_flush_size=args.detector_flush_batch_size,
294
+ )
295
+ payload = _asset_to_payload(result)
296
+ await sink.emit_batch([payload], skip_findings=False)
297
+
298
+ if hasattr(sink, "update_asset_status"):
299
+ f_total, f_by_sev, f_by_det = _compute_findings_counts(
300
+ result.findings or []
301
+ )
302
+ await sink.update_asset_status(
303
+ asset_hash,
304
+ "PROCESSED",
305
+ findings_total=f_total,
306
+ findings_by_severity=f_by_sev,
307
+ findings_by_detector=f_by_det,
308
+ )
309
+
310
+ source.evict_asset_cache(asset_hash)
311
+ processed_count += 1
312
+ except Exception as exc:
313
+ error_count += 1
314
+ logger.error("Asset %s failed: %s", asset_hash, exc)
315
+ if hasattr(sink, "update_asset_status"):
316
+ try:
317
+ error_msg = str(exc) or type(exc).__name__
318
+ await sink.update_asset_status(
319
+ asset_hash, "ERROR", error_msg
320
+ )
321
+ except Exception:
322
+ pass
296
323
 
297
324
  tasks = [_asyncio.create_task(_process_one(a)) for a in all_stubs]
298
325
  await _asyncio.gather(*tasks, return_exceptions=True)
@@ -340,6 +367,9 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
340
367
  "Failed to mark sink failure: %s", sink_error, exc_info=True
341
368
  )
342
369
  raise
370
+ finally:
371
+ if worker_pool is not None:
372
+ worker_pool.shutdown(wait=True)
343
373
 
344
374
  except Exception as e:
345
375
  logger.debug("Traceback for %s failure:", args.command, exc_info=True)
@@ -527,16 +557,16 @@ def main() -> None:
527
557
  help="How many detector-processed assets to accumulate before pushing findings to the API (default: 5, env: CLASSIFYRE_DETECTOR_FLUSH_BATCH_SIZE)",
528
558
  )
529
559
  parser.add_argument(
530
- "--detector-max-concurrent",
560
+ "--max-pool-workers",
531
561
  type=int,
532
562
  default=None,
533
- help="Max assets processed in parallel by the detector pipeline (default: 10, env: CLASSIFYRE_DETECTOR_MAX_CONCURRENT)",
563
+ help="Max OS processes in the detector pool. Auto-sized from CPU/memory when omitted (env: CLASSIFYRE_MAX_POOL_WORKERS)",
534
564
  )
535
565
  parser.add_argument(
536
- "--processing-workers",
566
+ "--max-concurrent-assets",
537
567
  type=int,
538
568
  default=None,
539
- help="Number of parallel asset-processing workers in Phase 2 (default: 2, env: CLASSIFYRE_PROCESSING_WORKERS)",
569
+ help="Max assets processed concurrently in Phase 2. Controls DB connection usage. Defaults to pool_workers*2 (env: CLASSIFYRE_MAX_CONCURRENT_ASSETS)",
540
570
  )
541
571
 
542
572
  args = parser.parse_args()
@@ -549,21 +579,19 @@ def main() -> None:
549
579
  args.detector_flush_batch_size = 5
550
580
  args.detector_flush_batch_size = max(args.detector_flush_batch_size, 1)
551
581
 
552
- if args.detector_max_concurrent is None:
553
- env_val = os.environ.get("CLASSIFYRE_DETECTOR_MAX_CONCURRENT")
582
+ if args.max_pool_workers is None:
583
+ env_val = os.environ.get("CLASSIFYRE_MAX_POOL_WORKERS")
554
584
  try:
555
- args.detector_max_concurrent = int(env_val) if env_val else 10
585
+ args.max_pool_workers = int(env_val) if env_val else None
556
586
  except ValueError:
557
- args.detector_max_concurrent = 10
558
- args.detector_max_concurrent = max(args.detector_max_concurrent, 1)
587
+ args.max_pool_workers = None
559
588
 
560
- if args.processing_workers is None:
561
- env_val = os.environ.get("CLASSIFYRE_PROCESSING_WORKERS")
589
+ if args.max_concurrent_assets is None:
590
+ env_val = os.environ.get("CLASSIFYRE_MAX_CONCURRENT_ASSETS")
562
591
  try:
563
- args.processing_workers = int(env_val) if env_val else 2
592
+ args.max_concurrent_assets = int(env_val) if env_val else None
564
593
  except ValueError:
565
- args.processing_workers = 2
566
- args.processing_workers = max(args.processing_workers, 1)
594
+ args.max_concurrent_assets = None
567
595
 
568
596
  if args.debug:
569
597
  logging.getLogger().setLevel(logging.DEBUG)
@@ -585,6 +613,15 @@ def main() -> None:
585
613
 
586
614
  recipe = load_recipe(args.recipe)
587
615
 
616
+ # Resolve resource overrides from recipe when CLI args / env vars are not set
617
+ recipe_resources = recipe.get("resources") or {}
618
+ if args.max_pool_workers is None and isinstance(recipe_resources.get("max_pool_workers"), int):
619
+ args.max_pool_workers = recipe_resources["max_pool_workers"]
620
+ if args.max_concurrent_assets is None and isinstance(
621
+ recipe_resources.get("max_concurrent_assets"), int
622
+ ):
623
+ args.max_concurrent_assets = recipe_resources["max_concurrent_assets"]
624
+
588
625
  source_type = recipe.get("type", "").lower()
589
626
  if not source_type:
590
627
  logger.error(
@@ -41,6 +41,7 @@ class AssetType(StrEnum):
41
41
  CONFLUENCE = 'CONFLUENCE'
42
42
  JIRA = 'JIRA'
43
43
  SERVICEDESK = 'SERVICEDESK'
44
+ SQLITE = 'SQLITE'
44
45
 
45
46
 
46
47
  class SourceCategory(StrEnum):
@@ -163,15 +164,15 @@ class ResourceOverrides(BaseModel):
163
164
  ge=60,
164
165
  le=86400,
165
166
  )
166
- processing_workers: int | None = Field(
167
+ max_pool_workers: int | None = Field(
167
168
  None,
168
- description='Number of parallel asset-processing workers in Phase 2 (default: 2)',
169
+ description='Max OS processes in the detector pool. Auto-sized from CPU/memory limits when omitted.',
169
170
  ge=1,
170
- le=20,
171
+ le=16,
171
172
  )
172
- detector_max_concurrent: int | None = Field(
173
+ max_concurrent_assets: int | None = Field(
173
174
  None,
174
- description='Max concurrent detector invocations across all pages (default: 5)',
175
+ description='Max assets processed concurrently. Controls parallel DB connections. Defaults to pool_workers * 2 when omitted.',
175
176
  ge=1,
176
177
  le=50,
177
178
  )
@@ -1842,6 +1843,7 @@ class Type(StrEnum):
1842
1843
  CONFLUENCE = 'CONFLUENCE'
1843
1844
  JIRA = 'JIRA'
1844
1845
  SERVICEDESK = 'SERVICEDESK'
1846
+ SQLITE = 'SQLITE'
1845
1847
 
1846
1848
 
1847
1849
  class SlackInput(CoreInput):
@@ -2628,6 +2630,7 @@ class Type17(StrEnum):
2628
2630
  CONFLUENCE = 'CONFLUENCE'
2629
2631
  JIRA = 'JIRA'
2630
2632
  SERVICEDESK = 'SERVICEDESK'
2633
+ SQLITE = 'SQLITE'
2631
2634
 
2632
2635
 
2633
2636
  class ConfluenceInput(CoreInput):
@@ -2682,6 +2685,59 @@ class ServiceDeskInput(CoreInput):
2682
2685
  resources: ResourceOverrides | None = None
2683
2686
 
2684
2687
 
2688
+ class SQLiteRequired(BaseModel):
2689
+ model_config = ConfigDict(
2690
+ extra='forbid',
2691
+ )
2692
+ database_path: str = Field(
2693
+ ...,
2694
+ description='Absolute or relative path to the SQLite database file (e.g. /data/app.db)',
2695
+ )
2696
+
2697
+
2698
+ class SQLiteOptionalScope(BaseModel):
2699
+ """
2700
+ Table selection scope.
2701
+ """
2702
+
2703
+ model_config = ConfigDict(
2704
+ extra='forbid',
2705
+ )
2706
+ include_tables: list[str] | None = Field(
2707
+ None,
2708
+ description='Optional table allowlist. Only tables in this list will be scanned.',
2709
+ )
2710
+ table_limit: int | None = Field(
2711
+ None, description='Optional cap on number of table assets extracted', ge=1
2712
+ )
2713
+
2714
+
2715
+ class SQLiteOptional(BaseModel):
2716
+ model_config = ConfigDict(
2717
+ extra='forbid',
2718
+ )
2719
+ scope: SQLiteOptionalScope | None = None
2720
+
2721
+
2722
+ class SQLiteInput(CoreInput):
2723
+ type: Literal['SQLITE'] = Field('SQLITE', description='Type of the asset or source')
2724
+ required: SQLiteRequired
2725
+ masked: dict[str, Any] | None = Field(
2726
+ None,
2727
+ description='SQLite has no credentials; this section is intentionally empty.',
2728
+ )
2729
+ optional: SQLiteOptional | None = None
2730
+ detectors: list[Detector] | None = Field(
2731
+ None, description='Detectors to run on ingested content'
2732
+ )
2733
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
2734
+ None,
2735
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
2736
+ )
2737
+ sampling: SamplingConfig
2738
+ resources: ResourceOverrides | None = None
2739
+
2740
+
2685
2741
  class SourceInput(
2686
2742
  RootModel[
2687
2743
  SlackInput
@@ -2703,6 +2759,7 @@ class SourceInput(
2703
2759
  | ConfluenceInput
2704
2760
  | JiraInput
2705
2761
  | ServiceDeskInput
2762
+ | SQLiteInput
2706
2763
  ]
2707
2764
  ):
2708
2765
  root: (
@@ -2725,6 +2782,7 @@ class SourceInput(
2725
2782
  | ConfluenceInput
2726
2783
  | JiraInput
2727
2784
  | ServiceDeskInput
2785
+ | SQLiteInput
2728
2786
  ) = Field(
2729
2787
  ...,
2730
2788
  description='Merged configuration schema with all source types and common definitions',