classifyre-cli 0.4.8__tar.gz → 0.4.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/package.json +1 -1
  4. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/pyproject.toml +7 -1
  5. classifyre_cli-0.4.10/src/detectors/custom/runners/_llm.py +230 -0
  6. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/models/generated_detectors.py +143 -5
  7. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/rest.py +71 -0
  8. classifyre_cli-0.4.10/tests/detectors/custom/test_llm_runner.py +157 -0
  9. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_outputs.py +3 -0
  10. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/uv.lock +446 -167
  11. classifyre_cli-0.4.8/src/detectors/custom/runners/_llm.py +0 -22
  12. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/.gitignore +0 -0
  13. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/.python-version +0 -0
  14. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/README.md +0 -0
  15. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/main.py +0 -0
  16. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/scripts/generate_models.py +0 -0
  17. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/__init__.py +0 -0
  18. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/__init__.py +0 -0
  19. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/base.py +0 -0
  20. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/broken_links/__init__.py +0 -0
  21. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/broken_links/detector.py +0 -0
  22. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/config.py +0 -0
  23. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/content/__init__.py +0 -0
  24. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/__init__.py +0 -0
  25. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/detector.py +0 -0
  26. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/extractor.py +0 -0
  27. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/__init__.py +0 -0
  28. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_base.py +0 -0
  29. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_factory.py +0 -0
  30. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  31. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_gliner2.py +0 -0
  32. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_image_classification.py +0 -0
  33. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_object_detection.py +0 -0
  34. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_regex.py +0 -0
  35. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_text_classification.py +0 -0
  36. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/trainer.py +0 -0
  37. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/dependencies.py +0 -0
  38. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/pii/__init__.py +0 -0
  39. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/pii/detector.py +0 -0
  40. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/secrets/__init__.py +0 -0
  41. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/secrets/detector.py +0 -0
  42. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/threat/__init__.py +0 -0
  43. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/threat/code_security_detector.py +0 -0
  44. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/threat/yara_detector.py +0 -0
  45. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/main.py +0 -0
  46. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/models/generated_input.py +0 -0
  47. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/models/generated_single_asset_scan_results.py +0 -0
  48. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/__init__.py +0 -0
  49. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/base.py +0 -0
  50. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/console.py +0 -0
  51. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/factory.py +0 -0
  52. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/file.py +0 -0
  53. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/__init__.py +0 -0
  54. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/content_provider.py +0 -0
  55. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/detector_pipeline.py +0 -0
  56. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/parsed_content_provider.py +0 -0
  57. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/worker_pool.py +0 -0
  58. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sandbox/__init__.py +0 -0
  59. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sandbox/runner.py +0 -0
  60. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/__init__.py +0 -0
  61. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/atlassian_common.py +0 -0
  62. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/azure_blob_storage/__init__.py +0 -0
  63. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/azure_blob_storage/source.py +0 -0
  64. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/base.py +0 -0
  65. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/confluence/__init__.py +0 -0
  66. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/confluence/source.py +0 -0
  67. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/databricks/__init__.py +0 -0
  68. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/databricks/source.py +0 -0
  69. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/dependencies.py +0 -0
  70. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/google_cloud_storage/__init__.py +0 -0
  71. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/google_cloud_storage/source.py +0 -0
  72. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/hive/__init__.py +0 -0
  73. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/hive/source.py +0 -0
  74. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/jira/__init__.py +0 -0
  75. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/jira/source.py +0 -0
  76. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mongodb/__init__.py +0 -0
  77. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mongodb/source.py +0 -0
  78. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mssql/__init__.py +0 -0
  79. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mssql/source.py +0 -0
  80. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mysql/__init__.py +0 -0
  81. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mysql/source.py +0 -0
  82. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/neo4j/__init__.py +0 -0
  83. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/neo4j/source.py +0 -0
  84. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/object_storage/base.py +0 -0
  85. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/oracle/__init__.py +0 -0
  86. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/oracle/source.py +0 -0
  87. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/postgresql/__init__.py +0 -0
  88. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/postgresql/source.py +0 -0
  89. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/powerbi/__init__.py +0 -0
  90. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/powerbi/source.py +0 -0
  91. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/recipe_normalizer.py +0 -0
  92. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/s3_compatible_storage/README.md +0 -0
  93. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/s3_compatible_storage/__init__.py +0 -0
  94. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/s3_compatible_storage/source.py +0 -0
  95. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/servicedesk/__init__.py +0 -0
  96. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/servicedesk/source.py +0 -0
  97. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/slack/__init__.py +0 -0
  98. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/slack/source.py +0 -0
  99. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/snowflake/__init__.py +0 -0
  100. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/snowflake/source.py +0 -0
  101. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/sqlite/__init__.py +0 -0
  102. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/sqlite/source.py +0 -0
  103. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/tableau/__init__.py +0 -0
  104. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/tableau/source.py +0 -0
  105. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/tabular_base.py +0 -0
  106. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/tabular_utils.py +0 -0
  107. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/wordpress/__init__.py +0 -0
  108. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/wordpress/source.py +0 -0
  109. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/telemetry.py +0 -0
  110. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/__init__.py +0 -0
  111. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/content_extraction.py +0 -0
  112. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/file_parser.py +0 -0
  113. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/hashing.py +0 -0
  114. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/uv_sync.py +0 -0
  115. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/validation.py +0 -0
  116. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/__init__.py +0 -0
  117. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/conftest.py +0 -0
  118. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/__init__.py +0 -0
  119. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  120. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/conftest.py +0 -0
  121. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/content/__init__.py +0 -0
  122. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/__init__.py +0 -0
  123. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/conftest.py +0 -0
  124. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  125. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  126. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/test_regex_runner.py +0 -0
  127. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/test_transformer_runners.py +0 -0
  128. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/__init__.py +0 -0
  129. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/conftest.py +0 -0
  130. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/sample_invoice.pdf +0 -0
  131. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/test_pii_detector.py +0 -0
  132. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  133. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/secrets/__init__.py +0 -0
  134. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  135. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  136. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_base_detector.py +0 -0
  137. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  138. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  139. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_detector_pipeline_types.py +0 -0
  140. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_detector_schema_examples.py +0 -0
  141. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_detector_types.py +0 -0
  142. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_phase2_detectors.py +0 -0
  143. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_registry.py +0 -0
  144. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/threat/__init__.py +0 -0
  145. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/threat/test_code_security_detector.py +0 -0
  146. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/threat/test_yara_detector.py +0 -0
  147. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  148. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/integration/test_wordpress_links_assets.py +0 -0
  149. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/pipeline/test_detector_pipeline.py +0 -0
  150. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/pipeline/test_worker_pool.py +0 -0
  151. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_azure_blob_storage_source.py +0 -0
  152. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_base_source_attachment.py +0 -0
  153. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_base_source_sampling.py +0 -0
  154. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_confluence_source.py +0 -0
  155. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_custom_extractor.py +0 -0
  156. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_databricks_source.py +0 -0
  157. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_google_cloud_storage_source.py +0 -0
  158. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_hashing.py +0 -0
  159. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_hive_source.py +0 -0
  160. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_jira_source.py +0 -0
  161. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_mongodb_source.py +0 -0
  162. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_mssql_source.py +0 -0
  163. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_mysql_source.py +0 -0
  164. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_neo4j_source.py +0 -0
  165. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_oracle_source.py +0 -0
  166. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_postgresql_source.py +0 -0
  167. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_powerbi_source.py +0 -0
  168. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_recipe_normalizer.py +0 -0
  169. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_s3_compatible_storage_source.py +0 -0
  170. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_servicedesk_source.py +0 -0
  171. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_slack_source.py +0 -0
  172. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_snowflake_source.py +0 -0
  173. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_source_dependency_groups.py +0 -0
  174. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_sqlite_source.py +0 -0
  175. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_tableau_source.py +0 -0
  176. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_tabular_utils.py +0 -0
  177. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_wordpress_source.py +0 -0
  178. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/utils/test_content_extraction.py +0 -0
  179. {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/utils/test_file_parser.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 256 packages in 185ms
2
+ Resolved 265 packages in 156ms
3
3
  Checked 50 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.8
3
+ Version: 0.4.10
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.8",
3
+ "version": "0.4.10",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.8"
3
+ version = "0.4.10"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -91,6 +91,9 @@ custom = [
91
91
  regex = [
92
92
  "google-re2>=1.1",
93
93
  ]
94
+ llm = [
95
+ "litellm>=1.86.2",
96
+ ]
94
97
  detectors = [
95
98
  { include-group = "file-processing" },
96
99
  { include-group = "privacy" },
@@ -101,6 +104,7 @@ detectors = [
101
104
  { include-group = "classification" },
102
105
  { include-group = "custom" },
103
106
  { include-group = "regex" },
107
+ { include-group = "llm" },
104
108
  ]
105
109
  file-processing = [
106
110
  "filetype>=1.2.0",
@@ -264,6 +268,8 @@ module = [
264
268
  "datasets",
265
269
  "setfit.*",
266
270
  "setfit",
271
+ "litellm.*",
272
+ "litellm",
267
273
  "sklearn.*",
268
274
  "sklearn",
269
275
  "numpy",
@@ -0,0 +1,230 @@
1
+ """AI/LLM pipeline runner — prompt-driven classification and field extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ from datetime import UTC, datetime
9
+ from typing import Any
10
+
11
+ # Quiet litellm's import-time provider preload warnings (bedrock/sagemaker need
12
+ # botocore, which we don't install) before the library is ever imported.
13
+ os.environ.setdefault("LITELLM_LOG", "ERROR")
14
+
15
+ from ....models.generated_detectors import LLMPipelineSchema, Severity
16
+ from ....models.generated_single_asset_scan_results import (
17
+ DetectionResult,
18
+ DetectorType,
19
+ )
20
+ from ...dependencies import require_module
21
+ from ._base import _TEXT_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Map the stored AI provider type onto the litellm model-string convention.
26
+ _PROVIDER_PREFIX: dict[str, str] = {
27
+ "CLAUDE": "anthropic",
28
+ "GEMINI": "gemini",
29
+ "OPENAI_COMPATIBLE": "openai",
30
+ }
31
+
32
+
33
+ class LLMRunner(BaseRunner):
34
+ """AI detector — sends content to a configured LLM provider for classification + extraction."""
35
+
36
+ def __init__(
37
+ self, schema: LLMPipelineSchema, detector_key: str = "", detector_name: str = ""
38
+ ) -> None:
39
+ self._schema = schema
40
+ self._detector_key = detector_key
41
+ self._detector_name = detector_name
42
+
43
+ runtime = schema.provider_runtime
44
+ if runtime is None:
45
+ raise ValueError(
46
+ f"AI detector '{detector_key}' is missing provider_runtime — the API must "
47
+ "inject resolved provider credentials before dispatch."
48
+ )
49
+ self._runtime = runtime
50
+ self._litellm = require_module("litellm", "llm", ["llm"])
51
+ # Let litellm silently drop params an endpoint doesn't support (e.g.
52
+ # response_format / temperature on some OpenAI-compatible gateways)
53
+ # instead of raising. Keep its own logging quiet.
54
+ self._litellm.drop_params = True
55
+ self._litellm.suppress_debug_info = True
56
+ logging.getLogger("LiteLLM").setLevel(logging.ERROR)
57
+
58
+ def run(self, text: str) -> None: # type: ignore[override] # pragma: no cover
59
+ raise NotImplementedError("LLMRunner uses detect() directly")
60
+
61
+ def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
62
+ if isinstance(content, bytes):
63
+ return []
64
+ if content_type not in _TEXT_CONTENT_TYPES:
65
+ return []
66
+ text = content.strip()
67
+ if not text:
68
+ return []
69
+
70
+ schema = self._schema
71
+ content_limit = schema.content_limit or 8000
72
+ snippet = text[:content_limit]
73
+
74
+ messages = [
75
+ {"role": "system", "content": self._build_system_prompt()},
76
+ {"role": "user", "content": snippet},
77
+ ]
78
+
79
+ try:
80
+ response = self._litellm.completion(
81
+ model=self._model_string(),
82
+ api_key=self._runtime.api_key,
83
+ api_base=self._runtime.base_url or None,
84
+ temperature=schema.temperature if schema.temperature is not None else 0.0,
85
+ max_tokens=self._max_tokens(),
86
+ messages=messages,
87
+ response_format={"type": "json_object"},
88
+ )
89
+ raw = response.choices[0].message.content or "{}"
90
+ parsed = self._parse_json(raw)
91
+ except Exception as exc:
92
+ logger.error(
93
+ "llm detector error (detector=%s, model=%s): %s",
94
+ self._detector_key,
95
+ self._runtime.model,
96
+ exc,
97
+ exc_info=True,
98
+ )
99
+ return []
100
+
101
+ return self._results_from_payload(snippet, parsed)
102
+
103
+ def get_supported_content_types(self) -> list[str]:
104
+ return list(_TEXT_CONTENT_TYPES)
105
+
106
+ # ── Internals ────────────────────────────────────────────────────────────
107
+
108
+ def _max_tokens(self) -> int | None:
109
+ # `max_tokens` is generated as a RootModel[int] wrapper, so unwrap `.root`
110
+ # before handing it to litellm — passing the model object serialises to an
111
+ # invalid request body and fails the whole completion.
112
+ raw = self._schema.max_tokens
113
+ if raw is None:
114
+ return None
115
+ return getattr(raw, "root", raw)
116
+
117
+ def _model_string(self) -> str:
118
+ prefix = _PROVIDER_PREFIX.get(self._runtime.provider.value, "openai")
119
+ return f"{prefix}/{self._runtime.model}"
120
+
121
+ def _build_system_prompt(self) -> str:
122
+ schema = self._schema
123
+ parts: list[str] = [schema.system_prompt.strip()]
124
+
125
+ labels = schema.labels or []
126
+ if labels:
127
+ label_lines = "\n".join(
128
+ f"- {lbl.name}: {lbl.description}" if lbl.description else f"- {lbl.name}"
129
+ for lbl in labels
130
+ )
131
+ parts.append(
132
+ "Classify the content using these labels:\n"
133
+ + label_lines
134
+ + (
135
+ "\nMultiple labels may apply."
136
+ if schema.multi_label
137
+ else "\nChoose the single best label."
138
+ )
139
+ )
140
+
141
+ fields = schema.output_fields or []
142
+ if fields:
143
+ field_lines = "\n".join(
144
+ f"- {f.name} ({f.type.value if f.type else 'string'}): {f.description}"
145
+ if f.description
146
+ else f"- {f.name} ({f.type.value if f.type else 'string'})"
147
+ for f in fields
148
+ )
149
+ parts.append("Also extract these fields:\n" + field_lines)
150
+
151
+ parts.append(
152
+ "Respond with a JSON object of the form: "
153
+ '{"labels": [{"name": "<label>", "confidence": <0-1>, '
154
+ '"matched_content": "<relevant snippet>"}], "fields": {<field name>: <value>}}. '
155
+ "Use only the labels listed above. Return an empty labels array when none apply."
156
+ )
157
+
158
+ if schema.response_example:
159
+ parts.append("Example response:\n" + schema.response_example.strip())
160
+
161
+ return "\n\n".join(parts)
162
+
163
+ @staticmethod
164
+ def _parse_json(raw: str) -> dict[str, Any]:
165
+ try:
166
+ parsed = json.loads(raw)
167
+ except json.JSONDecodeError:
168
+ start = raw.find("{")
169
+ end = raw.rfind("}")
170
+ if start == -1 or end == -1 or end <= start:
171
+ return {}
172
+ try:
173
+ parsed = json.loads(raw[start : end + 1])
174
+ except json.JSONDecodeError:
175
+ return {}
176
+ return parsed if isinstance(parsed, dict) else {}
177
+
178
+ def _results_from_payload(self, snippet: str, payload: dict[str, Any]) -> list[DetectionResult]:
179
+ schema = self._schema
180
+ threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
181
+ default_severity = schema.severity or Severity.info
182
+ extracted = self._coerce_fields(payload.get("fields"))
183
+
184
+ raw_labels = payload.get("labels")
185
+ label_entries: list[dict[str, Any]] = (
186
+ [lbl for lbl in raw_labels if isinstance(lbl, dict)]
187
+ if isinstance(raw_labels, list)
188
+ else []
189
+ )
190
+
191
+ results: list[DetectionResult] = []
192
+ for entry in label_entries:
193
+ label = str(entry.get("name", "")).strip()
194
+ if not label:
195
+ continue
196
+ confidence = float(entry.get("confidence", 1.0) or 0.0)
197
+ if confidence < threshold:
198
+ continue
199
+ severity = _resolve_pipeline_severity(label, schema.severity_map, default_severity)
200
+ matched = str(entry.get("matched_content") or "").strip() or snippet[:320]
201
+ results.append(
202
+ DetectionResult(
203
+ detector_type=DetectorType.CUSTOM,
204
+ finding_type=f"llm:{label}",
205
+ category="CLASSIFICATION",
206
+ severity=severity,
207
+ confidence=min(0.99, confidence),
208
+ matched_content=matched,
209
+ location=None,
210
+ custom_detector_key=self._detector_key,
211
+ custom_detector_name=self._detector_name,
212
+ detected_at=datetime.now(UTC),
213
+ metadata={
214
+ "runner": "LLM",
215
+ "provider": self._runtime.provider.value,
216
+ "model": self._runtime.model,
217
+ "label": label,
218
+ "fields": extracted,
219
+ },
220
+ extracted_data=extracted or None,
221
+ extraction_method="LLM",
222
+ )
223
+ )
224
+
225
+ results.sort(key=lambda r: r.confidence, reverse=True)
226
+ return results
227
+
228
+ @staticmethod
229
+ def _coerce_fields(raw: Any) -> dict[str, Any]:
230
+ return {str(k): v for k, v in raw.items()} if isinstance(raw, dict) else {}
@@ -189,7 +189,7 @@ class DetectorCatalog(RootModel[list[DetectorCatalogEntry]]):
189
189
  'categories': ['CLASSIFICATION', 'COMPLIANCE'],
190
190
  'supported_asset_types': ['TXT', 'TABLE', 'URL', 'IMAGE'],
191
191
  'recommended_model': 'mDeBERTa-v3 + SetFit + GLiNER + HuggingFace transformers',
192
- 'notes': 'User-defined rules and pipelines tailored to specific business needs. Supports regex, GLiNER2, LLM, text classification, image classification, feature extraction, and object detection pipelines.',
192
+ 'notes': 'User-defined rules and pipelines tailored to specific business needs. Supports regex, GLiNER2, AI/LLM (prompt-driven classification + extraction via a configured provider), text classification, image classification, feature extraction, and object detection pipelines.',
193
193
  },
194
194
  ],
195
195
  description='Detector capability catalog used for planning and runtime routing',
@@ -954,18 +954,156 @@ class RegexPipelineSchema(BaseModel):
954
954
  validation: PipelineValidationConfig | None = None
955
955
 
956
956
 
957
+ class LLMLabelDefinition(BaseModel):
958
+ """
959
+ One classification label the AI detector may assign to content.
960
+ """
961
+
962
+ model_config = ConfigDict(
963
+ extra='forbid',
964
+ )
965
+ name: str = Field(
966
+ ...,
967
+ description="Label name returned by the model (e.g. 'good', 'bad', 'violent').",
968
+ )
969
+ description: str | None = Field(
970
+ '', description='Guidance describing when this label applies.'
971
+ )
972
+
973
+
957
974
  class Type3(StrEnum):
975
+ string = 'string'
976
+ number = 'number'
977
+ boolean = 'boolean'
978
+ list_string_ = 'list[string]'
979
+ list_number_ = 'list[number]'
980
+
981
+
982
+ class LLMOutputField(BaseModel):
983
+ """
984
+ One structured property the AI detector extracts and stores in finding metadata and extracted_data.
985
+ """
986
+
987
+ model_config = ConfigDict(
988
+ extra='forbid',
989
+ )
990
+ name: str = Field(
991
+ ..., description='Output field name — becomes a key in extracted_data JSON.'
992
+ )
993
+ description: str | None = Field(
994
+ '', description='Hint for what this field captures.'
995
+ )
996
+ type: Type3 | None = 'string'
997
+
998
+
999
+ class Provider(StrEnum):
1000
+ """
1001
+ Resolved AI provider type.
1002
+ """
1003
+
1004
+ OPENAI_COMPATIBLE = 'OPENAI_COMPATIBLE'
1005
+ CLAUDE = 'CLAUDE'
1006
+ GEMINI = 'GEMINI'
1007
+
1008
+
1009
+ class LLMProviderRuntime(BaseModel):
1010
+ """
1011
+ Runtime-only provider credentials injected by the API at dispatch time. Never persisted with the detector config and rejected on create/update.
1012
+ """
1013
+
1014
+ model_config = ConfigDict(
1015
+ extra='forbid',
1016
+ )
1017
+ provider: Provider = Field(..., description='Resolved AI provider type.')
1018
+ model: str = Field(
1019
+ ...,
1020
+ description='Resolved model identifier (e.g. gpt-4o, claude-sonnet-4-5, gemini-2.0-flash).',
1021
+ )
1022
+ api_key: str = Field(..., description='Decrypted provider API key.')
1023
+ base_url: str | None = Field(
1024
+ None,
1025
+ description='Base URL for OpenAI-compatible endpoints. Null for managed providers.',
1026
+ )
1027
+ context_size: int | None = Field(
1028
+ None, description='Optional context window size configured for the provider.'
1029
+ )
1030
+
1031
+
1032
+ class Type4(StrEnum):
958
1033
  LLM = 'LLM'
959
1034
 
960
1035
 
1036
+ class MaxTokens(RootModel[int]):
1037
+ root: int = Field(
1038
+ None,
1039
+ description='Maximum tokens to generate. Provider default when null.',
1040
+ ge=1,
1041
+ )
1042
+
1043
+
961
1044
  class LLMPipelineSchema(BaseModel):
1045
+ """
1046
+ AI detector pipeline. Sends content to a configured LLM provider with a system prompt, classifies it against a label set, and extracts structured fields. Predicted labels become findings (severity via severity_map); extracted fields are stored in finding metadata and extracted_data.
1047
+ """
1048
+
962
1049
  model_config = ConfigDict(
963
1050
  extra='forbid',
964
1051
  )
965
1052
  type: Literal['LLM'] = 'LLM'
1053
+ system_prompt: str = Field(
1054
+ ...,
1055
+ description='Instruction describing what the model should detect, classify, and extract.',
1056
+ )
1057
+ response_example: str | None = Field(
1058
+ None,
1059
+ description='Optional few-shot example of the JSON the model should return.',
1060
+ )
1061
+ temperature: float | None = Field(
1062
+ 0.0,
1063
+ description='Sampling temperature. Lower is more deterministic.',
1064
+ ge=0.0,
1065
+ le=2.0,
1066
+ )
1067
+ max_tokens: MaxTokens | None = Field(
1068
+ None, description='Maximum tokens to generate. Provider default when null.'
1069
+ )
1070
+ labels: list[LLMLabelDefinition] | None = Field(
1071
+ [],
1072
+ description='Classification taxonomy the model assigns to content.',
1073
+ validate_default=True,
1074
+ )
1075
+ multi_label: bool | None = Field(
1076
+ False, description='Allow more than one label per asset.'
1077
+ )
1078
+ severity: Severity | None = Field(
1079
+ 'info',
1080
+ description='Default severity when no severity_map rule matches a predicted label.',
1081
+ )
1082
+ severity_map: list[PipelineSeverityRule] | None = Field(
1083
+ None,
1084
+ description='Ordered rules mapping predicted labels to severity levels. First matching rule wins.',
1085
+ )
1086
+ confidence_threshold: float | None = Field(
1087
+ 0.5,
1088
+ description='Minimum model confidence to report a label as a finding (0-1).',
1089
+ ge=0.0,
1090
+ le=1.0,
1091
+ )
1092
+ output_fields: list[LLMOutputField] | None = Field(
1093
+ [],
1094
+ description='Structured properties the model extracts. Stored in finding metadata and extracted_data.',
1095
+ validate_default=True,
1096
+ )
1097
+ content_limit: int | None = Field(
1098
+ 8000, description='Maximum characters of content sent to the model.', ge=1
1099
+ )
1100
+ provider_runtime: LLMProviderRuntime | None = Field(
1101
+ None,
1102
+ description='Runtime-only credentials injected by the API at dispatch. Never persisted; rejected on create/update.',
1103
+ )
966
1104
 
967
1105
 
968
- class Type4(StrEnum):
1106
+ class Type5(StrEnum):
969
1107
  TEXT_CLASSIFICATION = 'TEXT_CLASSIFICATION'
970
1108
 
971
1109
 
@@ -1055,7 +1193,7 @@ class TextClassificationPipelineSchema(BaseModel):
1055
1193
  )
1056
1194
 
1057
1195
 
1058
- class Type5(StrEnum):
1196
+ class Type6(StrEnum):
1059
1197
  IMAGE_CLASSIFICATION = 'IMAGE_CLASSIFICATION'
1060
1198
 
1061
1199
 
@@ -1108,7 +1246,7 @@ class ImageClassificationPipelineSchema(BaseModel):
1108
1246
  )
1109
1247
 
1110
1248
 
1111
- class Type6(StrEnum):
1249
+ class Type7(StrEnum):
1112
1250
  FEATURE_EXTRACTION = 'FEATURE_EXTRACTION'
1113
1251
 
1114
1252
 
@@ -1180,7 +1318,7 @@ class FeatureExtractionPipelineSchema(BaseModel):
1180
1318
  )
1181
1319
 
1182
1320
 
1183
- class Type7(StrEnum):
1321
+ class Type8(StrEnum):
1184
1322
  OBJECT_DETECTION = 'OBJECT_DETECTION'
1185
1323
 
1186
1324
 
@@ -1,17 +1,85 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
+ import random
4
5
  from typing import Any, Literal, cast
5
6
  from urllib.parse import urljoin
6
7
 
7
8
  import requests # type: ignore[import-untyped]
8
9
  from pydantic import BaseModel, ConfigDict, Field
10
+ from requests.adapters import HTTPAdapter
11
+ from urllib3.util.retry import Retry # type: ignore[import-untyped]
9
12
 
10
13
  from .base import OutputRuntimeContext, OutputType
11
14
 
12
15
  logger = logging.getLogger(__name__)
13
16
 
14
17
 
18
class _JitteredRetry(Retry):
    """Retry policy whose backoff delay is randomised by ±25 %.

    The delay produced by the parent class (exponential growth, already
    capped at backoff_max) is replaced by a value drawn uniformly from
    [0.75 * delay, 1.25 * delay], so that concurrent CLI jobs do not all
    retry in lock-step (thundering-herd mitigation).

    A zero delay is returned unchanged, and the jittered delay can never
    exceed backoff_max * 1.25.
    """

    # Half-width of the jitter window, as a fraction of the nominal delay.
    _JITTER_FACTOR: float = 0.25

    def get_backoff_time(self) -> float:  # type: ignore[override]
        """Return the parent's backoff delay with multiplicative jitter applied."""
        nominal = super().get_backoff_time()
        if not nominal:
            # No backoff is due; keep the retry immediate.
            return 0.0
        spread = self._JITTER_FACTOR
        return random.uniform(nominal * (1 - spread), nominal * (1 + spread))
37
+
38
+
39
+ # Retry policy for CLI → API REST calls.
40
+ #
41
+ # What we retry and why:
42
+ # connect=8 — pod restarted / not yet ready (RemoteDisconnected, ConnectionReset,
43
+ # ConnectTimeout). Request never reached the application.
44
+ # read=8 — API is under load and slow to respond (ReadTimeout). Safe to retry
45
+ # because all endpoints are idempotent (bulk ingest is upsert-based,
46
+ # status/findings updates are set-operations).
47
+ # status=8 — transient HTTP errors from an overloaded or restarting API:
48
+ # 408 Request Timeout - API-level timeout
49
+ # 429 Too Many Requests - rate-limited / backpressure
50
+ # 502 Bad Gateway - proxy has no upstream yet
51
+ # 503 Service Unavail. - under-pressure / pod not ready
52
+ # 504 Gateway Timeout - upstream took too long
53
+ #
54
+ # backoff_factor=2, backoff_max=60: exponential cap at 60 s, with ±25 % jitter
55
+ # (see _JitteredRetry). Approximate wait schedule between attempts:
56
+ # retry 1 → immediate (0 s; urllib3 never sleeps before the first retry)
56
+ # retry 2 → ~4 s
57
+ # retry 3 → ~8 s
58
+ # retry 4 → ~16 s
59
+ # retry 5 → ~32 s
60
+ # retry 6 → ~60 s (64 s capped by backoff_max)
61
+ # retry 7 → ~60 s (capped)
62
+ # retry 8 → ~60 s (capped)
63
+ # Total extra wait: ~240 s (~4 min) — covers extended load spikes on a
64
+ # single-node VPS before event-loop pressure drops. Worst-case a single
65
+ # call costs 9 * 120 s + 240 s = ~22 min, acceptable for long-running scans.
67
+ #
68
+ # POST and PATCH are explicitly allowed: by default urllib3 only retries
68
+ # idempotent methods (DELETE, GET, HEAD, OPTIONS, PUT, TRACE).
70
_RETRY_POLICY = _JitteredRetry(
    # Which requests and responses qualify for a retry. POST and PATCH are
    # not in urllib3's default allowed set, so every method is listed.
    allowed_methods={"DELETE", "GET", "HEAD", "OPTIONS", "PATCH", "POST", "PUT"},
    status_forcelist={408, 429, 502, 503, 504},
    # Per-category budgets, all bounded by the same overall total.
    total=8,
    connect=8,
    read=8,
    status=8,
    # Exponential backoff, capped at 60 s before jitter is applied.
    backoff_factor=2,
    backoff_max=60,
    # Return the last response instead of raising once status retries run out.
    raise_on_status=False,
)
81
+
82
+
15
83
  def _drop_none_recursive(value: Any) -> Any:
16
84
  if isinstance(value, dict):
17
85
  return {key: _drop_none_recursive(item) for key, item in value.items() if item is not None}
@@ -63,6 +131,9 @@ class RestOutputSink:
63
131
  self.base_url = base_url.rstrip("/")
64
132
  self.timeout_sec = timeout_sec
65
133
  self.session = requests.Session()
134
+ adapter = HTTPAdapter(max_retries=_RETRY_POLICY)
135
+ self.session.mount("http://", adapter)
136
+ self.session.mount("https://", adapter)
66
137
  self._runner_id = context.runner_id
67
138
  self._seen_hashes: set[str] = set()
68
139