classifyre-cli 0.4.3__tar.gz → 0.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/package.json +1 -1
  4. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/pyproject.toml +1 -1
  5. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/main.py +32 -1
  6. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/models/generated_input.py +64 -0
  7. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/detector_pipeline.py +60 -35
  8. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/worker_pool.py +17 -10
  9. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/databricks/source.py +287 -672
  10. classifyre_cli-0.4.4/src/sources/hive/source.py +304 -0
  11. classifyre_cli-0.4.4/src/sources/mssql/source.py +621 -0
  12. classifyre_cli-0.4.4/src/sources/mysql/source.py +303 -0
  13. classifyre_cli-0.4.4/src/sources/oracle/source.py +632 -0
  14. classifyre_cli-0.4.4/src/sources/postgresql/source.py +214 -0
  15. classifyre_cli-0.4.4/src/sources/snowflake/source.py +624 -0
  16. classifyre_cli-0.4.4/src/sources/sqlite/source.py +212 -0
  17. classifyre_cli-0.4.4/src/sources/tabular_base.py +793 -0
  18. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/tabular_utils.py +36 -0
  19. classifyre_cli-0.4.4/tests/detectors/threat/__init__.py +0 -0
  20. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/pipeline/test_detector_pipeline.py +1 -4
  21. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/pipeline/test_worker_pool.py +1 -0
  22. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_databricks_source.py +9 -9
  23. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_hive_source.py +8 -8
  24. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_mssql_source.py +5 -5
  25. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_mysql_source.py +8 -8
  26. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_oracle_source.py +27 -41
  27. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_postgresql_source.py +3 -0
  28. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_snowflake_source.py +2 -2
  29. classifyre_cli-0.4.4/tests/test_sqlite_source.py +336 -0
  30. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/uv.lock +2 -2
  31. classifyre_cli-0.4.3/src/sources/hive/source.py +0 -709
  32. classifyre_cli-0.4.3/src/sources/mssql/source.py +0 -1034
  33. classifyre_cli-0.4.3/src/sources/mysql/source.py +0 -797
  34. classifyre_cli-0.4.3/src/sources/oracle/source.py +0 -982
  35. classifyre_cli-0.4.3/src/sources/postgresql/source.py +0 -774
  36. classifyre_cli-0.4.3/src/sources/snowflake/source.py +0 -912
  37. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/.gitignore +0 -0
  38. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/.python-version +0 -0
  39. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/README.md +0 -0
  40. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/main.py +0 -0
  41. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/scripts/generate_models.py +0 -0
  42. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/__init__.py +0 -0
  43. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/__init__.py +0 -0
  44. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/base.py +0 -0
  45. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/broken_links/__init__.py +0 -0
  46. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/broken_links/detector.py +0 -0
  47. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/config.py +0 -0
  48. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/content/__init__.py +0 -0
  49. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/__init__.py +0 -0
  50. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/detector.py +0 -0
  51. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/__init__.py +0 -0
  52. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_base.py +0 -0
  53. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_factory.py +0 -0
  54. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  55. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_gliner2.py +0 -0
  56. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_image_classification.py +0 -0
  57. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_llm.py +0 -0
  58. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_object_detection.py +0 -0
  59. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_regex.py +0 -0
  60. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_text_classification.py +0 -0
  61. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/trainer.py +0 -0
  62. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/dependencies.py +0 -0
  63. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/pii/__init__.py +0 -0
  64. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/pii/detector.py +0 -0
  65. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/secrets/__init__.py +0 -0
  66. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/secrets/detector.py +0 -0
  67. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/threat/__init__.py +0 -0
  68. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/threat/code_security_detector.py +0 -0
  69. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/threat/yara_detector.py +0 -0
  70. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/models/generated_detectors.py +0 -0
  71. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/models/generated_single_asset_scan_results.py +0 -0
  72. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/__init__.py +0 -0
  73. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/base.py +0 -0
  74. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/console.py +0 -0
  75. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/factory.py +0 -0
  76. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/file.py +0 -0
  77. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/rest.py +0 -0
  78. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/__init__.py +0 -0
  79. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/content_provider.py +0 -0
  80. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/parsed_content_provider.py +0 -0
  81. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sandbox/__init__.py +0 -0
  82. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sandbox/runner.py +0 -0
  83. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/__init__.py +0 -0
  84. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/atlassian_common.py +0 -0
  85. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/azure_blob_storage/__init__.py +0 -0
  86. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/azure_blob_storage/source.py +0 -0
  87. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/base.py +0 -0
  88. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/confluence/__init__.py +0 -0
  89. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/confluence/source.py +0 -0
  90. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/databricks/__init__.py +0 -0
  91. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/dependencies.py +0 -0
  92. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/google_cloud_storage/__init__.py +0 -0
  93. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/google_cloud_storage/source.py +0 -0
  94. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/hive/__init__.py +0 -0
  95. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/jira/__init__.py +0 -0
  96. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/jira/source.py +0 -0
  97. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/mongodb/__init__.py +0 -0
  98. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/mongodb/source.py +0 -0
  99. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/mssql/__init__.py +0 -0
  100. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/mysql/__init__.py +0 -0
  101. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/neo4j/__init__.py +0 -0
  102. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/neo4j/source.py +0 -0
  103. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/object_storage/base.py +0 -0
  104. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/oracle/__init__.py +0 -0
  105. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/postgresql/__init__.py +0 -0
  106. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/powerbi/__init__.py +0 -0
  107. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/powerbi/source.py +0 -0
  108. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/recipe_normalizer.py +0 -0
  109. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/README.md +0 -0
  110. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/__init__.py +0 -0
  111. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/source.py +0 -0
  112. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/servicedesk/__init__.py +0 -0
  113. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/servicedesk/source.py +0 -0
  114. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/slack/__init__.py +0 -0
  115. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/slack/source.py +0 -0
  116. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/snowflake/__init__.py +0 -0
  117. {classifyre_cli-0.4.3/tests → classifyre_cli-0.4.4/src/sources/sqlite}/__init__.py +0 -0
  118. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/tableau/__init__.py +0 -0
  119. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/tableau/source.py +0 -0
  120. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/wordpress/__init__.py +0 -0
  121. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/wordpress/source.py +0 -0
  122. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/telemetry.py +0 -0
  123. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/__init__.py +0 -0
  124. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/content_extraction.py +0 -0
  125. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/file_parser.py +0 -0
  126. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/hashing.py +0 -0
  127. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/uv_sync.py +0 -0
  128. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/validation.py +0 -0
  129. {classifyre_cli-0.4.3/tests/detectors → classifyre_cli-0.4.4/tests}/__init__.py +0 -0
  130. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/conftest.py +0 -0
  131. {classifyre_cli-0.4.3/tests/detectors/content → classifyre_cli-0.4.4/tests/detectors}/__init__.py +0 -0
  132. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  133. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/conftest.py +0 -0
  134. {classifyre_cli-0.4.3/tests/detectors/custom → classifyre_cli-0.4.4/tests/detectors/content}/__init__.py +0 -0
  135. {classifyre_cli-0.4.3/tests/detectors/pii → classifyre_cli-0.4.4/tests/detectors/custom}/__init__.py +0 -0
  136. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/conftest.py +0 -0
  137. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  138. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  139. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/test_regex_runner.py +0 -0
  140. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/test_transformer_runners.py +0 -0
  141. {classifyre_cli-0.4.3/tests/detectors/secrets → classifyre_cli-0.4.4/tests/detectors/pii}/__init__.py +0 -0
  142. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/pii/conftest.py +0 -0
  143. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/pii/sample_invoice.pdf +0 -0
  144. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/pii/test_pii_detector.py +0 -0
  145. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  146. {classifyre_cli-0.4.3/tests/detectors/threat → classifyre_cli-0.4.4/tests/detectors/secrets}/__init__.py +0 -0
  147. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  148. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  149. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_base_detector.py +0 -0
  150. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  151. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  152. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_detector_pipeline_types.py +0 -0
  153. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_detector_schema_examples.py +0 -0
  154. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_detector_types.py +0 -0
  155. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_phase2_detectors.py +0 -0
  156. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_registry.py +0 -0
  157. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/threat/test_code_security_detector.py +0 -0
  158. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/threat/test_yara_detector.py +0 -0
  159. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  160. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/integration/test_wordpress_links_assets.py +0 -0
  161. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_azure_blob_storage_source.py +0 -0
  162. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_base_source_attachment.py +0 -0
  163. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_base_source_sampling.py +0 -0
  164. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_confluence_source.py +0 -0
  165. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_google_cloud_storage_source.py +0 -0
  166. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_hashing.py +0 -0
  167. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_jira_source.py +0 -0
  168. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_mongodb_source.py +0 -0
  169. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_neo4j_source.py +0 -0
  170. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_outputs.py +0 -0
  171. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_powerbi_source.py +0 -0
  172. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_recipe_normalizer.py +0 -0
  173. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_s3_compatible_storage_source.py +0 -0
  174. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_servicedesk_source.py +0 -0
  175. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_slack_source.py +0 -0
  176. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_source_dependency_groups.py +0 -0
  177. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_tableau_source.py +0 -0
  178. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_tabular_utils.py +0 -0
  179. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_wordpress_source.py +0 -0
  180. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/utils/test_content_extraction.py +0 -0
  181. {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/utils/test_file_parser.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 256 packages in 159ms
2
+ Resolved 256 packages in 170ms
3
3
  Checked 49 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.3
3
+ Version: 0.4.4
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.3",
3
+ "version": "0.4.4",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.3"
3
+ version = "0.4.4"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -243,14 +243,23 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
243
243
  import asyncio as _asyncio
244
244
 
245
245
  processed_count = 0
246
+ _pw = worker_pool.max_workers if worker_pool else 4
247
+ max_concurrent = args.max_concurrent_assets or (_pw * 2)
248
+ max_concurrent = max(1, max_concurrent)
249
+ _asset_semaphore = _asyncio.Semaphore(max_concurrent)
246
250
  logger.info(
247
- "Phase 2 starting: %d assets, pool_workers=%s",
251
+ "Phase 2 starting: %d assets, pool_workers=%s, max_concurrent_assets=%d",
248
252
  len(all_stubs),
249
253
  worker_pool.max_workers if worker_pool else "none",
254
+ max_concurrent,
250
255
  )
251
256
  error_count = 0
252
257
 
253
258
  async def _process_one(asset: Any) -> None:
259
+ async with _asset_semaphore:
260
+ await _process_one_inner(asset)
261
+
262
+ async def _process_one_inner(asset: Any) -> None:
254
263
  nonlocal processed_count, error_count
255
264
  asset_hash = getattr(asset, "hash", None) or ""
256
265
  try:
@@ -553,6 +562,12 @@ def main() -> None:
553
562
  default=None,
554
563
  help="Max OS processes in the detector pool. Auto-sized from CPU/memory when omitted (env: CLASSIFYRE_MAX_POOL_WORKERS)",
555
564
  )
565
+ parser.add_argument(
566
+ "--max-concurrent-assets",
567
+ type=int,
568
+ default=None,
569
+ help="Max assets processed concurrently in Phase 2. Controls DB connection usage. Defaults to pool_workers*2 (env: CLASSIFYRE_MAX_CONCURRENT_ASSETS)",
570
+ )
556
571
 
557
572
  args = parser.parse_args()
558
573
 
@@ -571,6 +586,13 @@ def main() -> None:
571
586
  except ValueError:
572
587
  args.max_pool_workers = None
573
588
 
589
+ if args.max_concurrent_assets is None:
590
+ env_val = os.environ.get("CLASSIFYRE_MAX_CONCURRENT_ASSETS")
591
+ try:
592
+ args.max_concurrent_assets = int(env_val) if env_val else None
593
+ except ValueError:
594
+ args.max_concurrent_assets = None
595
+
574
596
  if args.debug:
575
597
  logging.getLogger().setLevel(logging.DEBUG)
576
598
 
@@ -591,6 +613,15 @@ def main() -> None:
591
613
 
592
614
  recipe = load_recipe(args.recipe)
593
615
 
616
+ # Resolve resource overrides from recipe when CLI args / env vars are not set
617
+ recipe_resources = recipe.get("resources") or {}
618
+ if args.max_pool_workers is None and isinstance(recipe_resources.get("max_pool_workers"), int):
619
+ args.max_pool_workers = recipe_resources["max_pool_workers"]
620
+ if args.max_concurrent_assets is None and isinstance(
621
+ recipe_resources.get("max_concurrent_assets"), int
622
+ ):
623
+ args.max_concurrent_assets = recipe_resources["max_concurrent_assets"]
624
+
594
625
  source_type = recipe.get("type", "").lower()
595
626
  if not source_type:
596
627
  logger.error(
@@ -41,6 +41,7 @@ class AssetType(StrEnum):
41
41
  CONFLUENCE = 'CONFLUENCE'
42
42
  JIRA = 'JIRA'
43
43
  SERVICEDESK = 'SERVICEDESK'
44
+ SQLITE = 'SQLITE'
44
45
 
45
46
 
46
47
  class SourceCategory(StrEnum):
@@ -169,6 +170,12 @@ class ResourceOverrides(BaseModel):
169
170
  ge=1,
170
171
  le=16,
171
172
  )
173
+ max_concurrent_assets: int | None = Field(
174
+ None,
175
+ description='Max assets processed concurrently. Controls parallel DB connections. Defaults to pool_workers * 2 when omitted.',
176
+ ge=1,
177
+ le=50,
178
+ )
172
179
 
173
180
 
174
181
  class Detector(BaseModel):
@@ -1836,6 +1843,7 @@ class Type(StrEnum):
1836
1843
  CONFLUENCE = 'CONFLUENCE'
1837
1844
  JIRA = 'JIRA'
1838
1845
  SERVICEDESK = 'SERVICEDESK'
1846
+ SQLITE = 'SQLITE'
1839
1847
 
1840
1848
 
1841
1849
  class SlackInput(CoreInput):
@@ -2622,6 +2630,7 @@ class Type17(StrEnum):
2622
2630
  CONFLUENCE = 'CONFLUENCE'
2623
2631
  JIRA = 'JIRA'
2624
2632
  SERVICEDESK = 'SERVICEDESK'
2633
+ SQLITE = 'SQLITE'
2625
2634
 
2626
2635
 
2627
2636
  class ConfluenceInput(CoreInput):
@@ -2676,6 +2685,59 @@ class ServiceDeskInput(CoreInput):
2676
2685
  resources: ResourceOverrides | None = None
2677
2686
 
2678
2687
 
2688
+ class SQLiteRequired(BaseModel):
2689
+ model_config = ConfigDict(
2690
+ extra='forbid',
2691
+ )
2692
+ database_path: str = Field(
2693
+ ...,
2694
+ description='Absolute or relative path to the SQLite database file (e.g. /data/app.db)',
2695
+ )
2696
+
2697
+
2698
+ class SQLiteOptionalScope(BaseModel):
2699
+ """
2700
+ Table selection scope.
2701
+ """
2702
+
2703
+ model_config = ConfigDict(
2704
+ extra='forbid',
2705
+ )
2706
+ include_tables: list[str] | None = Field(
2707
+ None,
2708
+ description='Optional table allowlist. Only tables in this list will be scanned.',
2709
+ )
2710
+ table_limit: int | None = Field(
2711
+ None, description='Optional cap on number of table assets extracted', ge=1
2712
+ )
2713
+
2714
+
2715
+ class SQLiteOptional(BaseModel):
2716
+ model_config = ConfigDict(
2717
+ extra='forbid',
2718
+ )
2719
+ scope: SQLiteOptionalScope | None = None
2720
+
2721
+
2722
+ class SQLiteInput(CoreInput):
2723
+ type: Literal['SQLITE'] = Field('SQLITE', description='Type of the asset or source')
2724
+ required: SQLiteRequired
2725
+ masked: dict[str, Any] | None = Field(
2726
+ None,
2727
+ description='SQLite has no credentials; this section is intentionally empty.',
2728
+ )
2729
+ optional: SQLiteOptional | None = None
2730
+ detectors: list[Detector] | None = Field(
2731
+ None, description='Detectors to run on ingested content'
2732
+ )
2733
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
2734
+ None,
2735
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
2736
+ )
2737
+ sampling: SamplingConfig
2738
+ resources: ResourceOverrides | None = None
2739
+
2740
+
2679
2741
  class SourceInput(
2680
2742
  RootModel[
2681
2743
  SlackInput
@@ -2697,6 +2759,7 @@ class SourceInput(
2697
2759
  | ConfluenceInput
2698
2760
  | JiraInput
2699
2761
  | ServiceDeskInput
2762
+ | SQLiteInput
2700
2763
  ]
2701
2764
  ):
2702
2765
  root: (
@@ -2719,6 +2782,7 @@ class SourceInput(
2719
2782
  | ConfluenceInput
2720
2783
  | JiraInput
2721
2784
  | ServiceDeskInput
2785
+ | SQLiteInput
2722
2786
  ) = Field(
2723
2787
  ...,
2724
2788
  description='Merged configuration schema with all source types and common definitions',
@@ -66,9 +66,7 @@ class DetectorPipeline:
66
66
  self.content_provider = ParsedContentProvider(source)
67
67
  self.init_warnings: list[str] = []
68
68
 
69
- def _register_detector_info(
70
- self, detector: BaseDetector, info: _DetectorInfo
71
- ) -> None:
69
+ def _register_detector_info(self, detector: BaseDetector, info: _DetectorInfo) -> None:
72
70
  self._detector_info[id(detector)] = info
73
71
 
74
72
  def _get_detector_info(self, detector: BaseDetector) -> _DetectorInfo | None:
@@ -156,9 +154,7 @@ class DetectorPipeline:
156
154
  all_active = text_detectors + binary_detectors + link_detectors
157
155
  detector_names = [self._detector_log_label(d) for d in all_active]
158
156
  pool_tag = "[pool]" if self._worker_pool else "[in-process]"
159
- logger.info(
160
- "%s Scanning %s [%s]", pool_tag, asset.name, ", ".join(detector_names)
161
- )
157
+ logger.info("%s Scanning %s [%s]", pool_tag, asset.name, ", ".join(detector_names))
162
158
 
163
159
  findings: list[DetectionResult] = []
164
160
  detector_types_run: list[DetectorType] = []
@@ -241,12 +237,13 @@ class DetectorPipeline:
241
237
  if findings:
242
238
  logger.info(
243
239
  "%s Scanned %s: %d finding(s) in %dms",
244
- pool_tag, asset.name, len(findings), scan_duration,
240
+ pool_tag,
241
+ asset.name,
242
+ len(findings),
243
+ scan_duration,
245
244
  )
246
245
  else:
247
- logger.info(
248
- "%s Scanned %s: no findings (%dms)", pool_tag, asset.name, scan_duration
249
- )
246
+ logger.info("%s Scanned %s: no findings (%dms)", pool_tag, asset.name, scan_duration)
250
247
 
251
248
  return asset
252
249
 
@@ -299,7 +296,10 @@ class DetectorPipeline:
299
296
  elapsed = int((time.monotonic() - t0) * 1000)
300
297
  logger.info(
301
298
  " %s page %d done: %d findings (%dms)",
302
- asset.name, page_num, len(page_findings), elapsed,
299
+ asset.name,
300
+ page_num,
301
+ len(page_findings),
302
+ elapsed,
303
303
  )
304
304
  return page_findings, page_types, page_errors, page_content, page_num
305
305
 
@@ -322,9 +322,7 @@ class DetectorPipeline:
322
322
  page_content,
323
323
  )
324
324
 
325
- max_pending = max(
326
- 2, self._worker_pool.max_workers * 2 if self._worker_pool else 4
327
- )
325
+ max_pending = max(2, self._worker_pool.max_workers * 2 if self._worker_pool else 4)
328
326
 
329
327
  async for text_content in self._iter_text_content_pages(asset):
330
328
  page_index += 1
@@ -335,18 +333,22 @@ class DetectorPipeline:
335
333
 
336
334
  while len(pending_tasks) >= max_pending:
337
335
  done, pending_tasks = await asyncio.wait(
338
- pending_tasks, return_when=asyncio.FIRST_COMPLETED,
336
+ pending_tasks,
337
+ return_when=asyncio.FIRST_COMPLETED,
339
338
  )
340
339
  for task in done:
341
340
  page_findings, page_types, page_errors, page_content, _pn = task.result()
342
341
  findings.extend(page_findings)
343
342
  errors.extend(page_errors)
344
343
  detector_types_run = self._merge_detector_types(
345
- detector_types_run, page_types,
344
+ detector_types_run,
345
+ page_types,
346
346
  )
347
347
  for finding in page_findings:
348
348
  self.content_provider.enrich_finding_location(
349
- finding, asset, page_content,
349
+ finding,
350
+ asset,
351
+ page_content,
350
352
  )
351
353
 
352
354
  task = asyncio.create_task(_detect_page(text_content, page_index))
@@ -401,7 +403,10 @@ class DetectorPipeline:
401
403
  elapsed = int((time.monotonic() - t0) * 1000)
402
404
  logger.info(
403
405
  " %s page %d done: %d findings (%dms)",
404
- asset.name, page_num, len(page_findings), elapsed,
406
+ asset.name,
407
+ page_num,
408
+ len(page_findings),
409
+ elapsed,
405
410
  )
406
411
  return page_findings, page_types, page_errors, page_content, page_num
407
412
 
@@ -413,26 +418,29 @@ class DetectorPipeline:
413
418
  page_findings, page_types, page_errors, page_content, _pn = task.result()
414
419
  for finding in page_findings:
415
420
  self.content_provider.enrich_finding_location(
416
- finding, asset, page_content,
421
+ finding,
422
+ asset,
423
+ page_content,
417
424
  )
418
425
  findings.extend(page_findings)
419
426
  errors.extend(page_errors)
420
427
  detector_types_run = self._merge_detector_types(
421
- detector_types_run, page_types,
428
+ detector_types_run,
429
+ page_types,
422
430
  )
423
431
  unflushed_count += len(page_findings)
424
432
 
425
433
  if unflushed_count >= findings_flush_size and unflushed_count > 0:
426
434
  logger.debug(
427
435
  " %s flushing %d findings (%d total)",
428
- asset.name, unflushed_count, len(findings),
436
+ asset.name,
437
+ unflushed_count,
438
+ len(findings),
429
439
  )
430
440
  await on_findings_flushed(list(findings))
431
441
  unflushed_count = 0
432
442
 
433
- max_pending = max(
434
- 2, self._worker_pool.max_workers * 2 if self._worker_pool else 4
435
- )
443
+ max_pending = max(2, self._worker_pool.max_workers * 2 if self._worker_pool else 4)
436
444
 
437
445
  async for text_content in self._iter_text_content_pages(asset):
438
446
  page_index += 1
@@ -443,25 +451,31 @@ class DetectorPipeline:
443
451
 
444
452
  while len(pending_tasks) >= max_pending:
445
453
  done, pending_tasks_set = await asyncio.wait(
446
- pending_tasks, return_when=asyncio.FIRST_COMPLETED,
454
+ pending_tasks,
455
+ return_when=asyncio.FIRST_COMPLETED,
447
456
  )
448
457
  pending_tasks = pending_tasks_set
449
458
  for task in done:
450
459
  page_findings, page_types, page_errors, page_content, _pn = task.result()
451
460
  for finding in page_findings:
452
461
  self.content_provider.enrich_finding_location(
453
- finding, asset, page_content,
462
+ finding,
463
+ asset,
464
+ page_content,
454
465
  )
455
466
  findings.extend(page_findings)
456
467
  errors.extend(page_errors)
457
468
  detector_types_run = self._merge_detector_types(
458
- detector_types_run, page_types,
469
+ detector_types_run,
470
+ page_types,
459
471
  )
460
472
  unflushed_count += len(page_findings)
461
473
  if unflushed_count >= findings_flush_size and unflushed_count > 0:
462
474
  logger.info(
463
475
  " %s flushing %d findings (%d total)",
464
- asset.name, unflushed_count, len(findings),
476
+ asset.name,
477
+ unflushed_count,
478
+ len(findings),
465
479
  )
466
480
  await on_findings_flushed(list(findings))
467
481
  unflushed_count = 0
@@ -637,9 +651,7 @@ class DetectorPipeline:
637
651
  errors: list[str] = []
638
652
  detected_at = datetime.now(UTC)
639
653
 
640
- for i, (detector, result) in enumerate(
641
- zip(runnable_detectors, results, strict=False)
642
- ):
654
+ for i, (detector, result) in enumerate(zip(runnable_detectors, results, strict=False)):
643
655
  detector_name = detector.__class__.__name__
644
656
  via = task_via[i]
645
657
  loc = f"{asset_name}:{page_tag}" if page_tag else asset_name
@@ -648,7 +660,11 @@ class DetectorPipeline:
648
660
  wall_ms = int((time.monotonic() - task_start_times[i]) * 1000)
649
661
  logger.error(
650
662
  " [%s] %s on %s: FAILED in %dms — %s",
651
- via, detector_name, loc, wall_ms, result,
663
+ via,
664
+ detector_name,
665
+ loc,
666
+ wall_ms,
667
+ result,
652
668
  )
653
669
  errors.append(f"{detector_name}: {result}")
654
670
  continue
@@ -677,12 +693,19 @@ class DetectorPipeline:
677
693
  if detector_findings:
678
694
  logger.info(
679
695
  " [%s] %s on %s: %d finding(s) in %dms",
680
- pid_tag, detector_name, loc, len(detector_findings), worker_elapsed,
696
+ pid_tag,
697
+ detector_name,
698
+ loc,
699
+ len(detector_findings),
700
+ worker_elapsed,
681
701
  )
682
702
  else:
683
703
  logger.info(
684
704
  " [%s] %s on %s: clean (%dms)",
685
- pid_tag, detector_name, loc, worker_elapsed,
705
+ pid_tag,
706
+ detector_name,
707
+ loc,
708
+ worker_elapsed,
686
709
  )
687
710
 
688
711
  all_findings.extend(detector_findings)
@@ -835,7 +858,9 @@ class DetectorPipeline:
835
858
 
836
859
  if not detector_configs:
837
860
  return cls(
838
- detectors=[], source=source, runner_id=runner_id,
861
+ detectors=[],
862
+ source=source,
863
+ runner_id=runner_id,
839
864
  worker_pool=worker_pool,
840
865
  )
841
866
 
@@ -37,9 +37,7 @@ class _WorkerResult:
37
37
 
38
38
  __slots__ = ("elapsed_ms", "findings", "worker_pid")
39
39
 
40
- def __init__(
41
- self, findings: list[dict[str, Any]], worker_pid: int, elapsed_ms: int
42
- ) -> None:
40
+ def __init__(self, findings: list[dict[str, Any]], worker_pid: int, elapsed_ms: int) -> None:
43
41
  self.findings = findings
44
42
  self.worker_pid = worker_pid
45
43
  self.elapsed_ms = elapsed_ms
@@ -80,9 +78,7 @@ def _detect_in_worker(
80
78
  from ..detectors import get_detector
81
79
  from ..detectors.config import parse_detector_config
82
80
 
83
- name, typed_config = parse_detector_config(
84
- detector_type, json.loads(config_json)
85
- )
81
+ name, typed_config = parse_detector_config(detector_type, json.loads(config_json))
86
82
  detector = get_detector(name, typed_config)
87
83
  _worker_detector_cache[cache_key] = detector
88
84
  logging.getLogger(__name__).info(
@@ -103,7 +99,9 @@ def _detect_in_worker(
103
99
  elif detector_name == "custom":
104
100
  results = detector._detect_sync(content, content_type)
105
101
  else:
106
- text = content if isinstance(content, str) else content.decode("utf-8", errors="replace")
102
+ text = (
103
+ content if isinstance(content, str) else content.decode("utf-8", errors="replace")
104
+ )
107
105
  results = detector._detect_sync(text)
108
106
  else:
109
107
  results = asyncio.run(detector.detect(content, content_type))
@@ -111,7 +109,10 @@ def _detect_in_worker(
111
109
  elapsed_ms = int((time.monotonic() - t0) * 1000)
112
110
  logging.getLogger(__name__).info(
113
111
  "Worker %d ran %s: %d findings in %dms",
114
- pid, detector_name, len(results), elapsed_ms,
112
+ pid,
113
+ detector_name,
114
+ len(results),
115
+ elapsed_ms,
115
116
  )
116
117
 
117
118
  findings = [
@@ -209,7 +210,11 @@ def compute_pool_workers(override: int | None = None) -> int:
209
210
 
210
211
  logger.info(
211
212
  "Pool sizing: cpu_budget=%d (cpus=%d), mem_budget=%d (%dMB), effective=%d",
212
- cpu_budget, cpus, mem_budget, mem_mb, effective,
213
+ cpu_budget,
214
+ cpus,
215
+ mem_budget,
216
+ mem_mb,
217
+ effective,
213
218
  )
214
219
  return effective
215
220
 
@@ -238,7 +243,9 @@ class DetectorWorkerPool:
238
243
  self._shutdown = False
239
244
  logger.info(
240
245
  "Detector pool started: %d workers (method=%s, pid=%d)",
241
- effective_workers, mp_start_method, os.getpid(),
246
+ effective_workers,
247
+ mp_start_method,
248
+ os.getpid(),
242
249
  )
243
250
 
244
251
  @property