classifyre-cli 0.4.4__tar.gz → 0.4.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178)
  1. classifyre_cli-0.4.6/.turbo/turbo-build.log +3 -0
  2. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/package.json +1 -1
  4. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/pyproject.toml +1 -1
  5. classifyre_cli-0.4.6/src/detectors/custom/extractor.py +261 -0
  6. classifyre_cli-0.4.6/tests/test_custom_extractor.py +291 -0
  7. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/uv.lock +2 -2
  8. classifyre_cli-0.4.4/.turbo/turbo-build.log +0 -3
  9. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/.gitignore +0 -0
  10. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/.python-version +0 -0
  11. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/README.md +0 -0
  12. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/main.py +0 -0
  13. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/scripts/generate_models.py +0 -0
  14. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/__init__.py +0 -0
  15. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/__init__.py +0 -0
  16. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/base.py +0 -0
  17. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/broken_links/__init__.py +0 -0
  18. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/broken_links/detector.py +0 -0
  19. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/config.py +0 -0
  20. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/content/__init__.py +0 -0
  21. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/__init__.py +0 -0
  22. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/detector.py +0 -0
  23. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/__init__.py +0 -0
  24. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_base.py +0 -0
  25. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_factory.py +0 -0
  26. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  27. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_gliner2.py +0 -0
  28. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_image_classification.py +0 -0
  29. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_llm.py +0 -0
  30. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_object_detection.py +0 -0
  31. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_regex.py +0 -0
  32. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/runners/_text_classification.py +0 -0
  33. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/custom/trainer.py +0 -0
  34. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/dependencies.py +0 -0
  35. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/pii/__init__.py +0 -0
  36. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/pii/detector.py +0 -0
  37. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/secrets/__init__.py +0 -0
  38. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/secrets/detector.py +0 -0
  39. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/threat/__init__.py +0 -0
  40. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/threat/code_security_detector.py +0 -0
  41. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/detectors/threat/yara_detector.py +0 -0
  42. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/main.py +0 -0
  43. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/models/generated_detectors.py +0 -0
  44. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/models/generated_input.py +0 -0
  45. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/models/generated_single_asset_scan_results.py +0 -0
  46. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/outputs/__init__.py +0 -0
  47. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/outputs/base.py +0 -0
  48. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/outputs/console.py +0 -0
  49. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/outputs/factory.py +0 -0
  50. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/outputs/file.py +0 -0
  51. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/outputs/rest.py +0 -0
  52. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/pipeline/__init__.py +0 -0
  53. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/pipeline/content_provider.py +0 -0
  54. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/pipeline/detector_pipeline.py +0 -0
  55. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/pipeline/parsed_content_provider.py +0 -0
  56. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/pipeline/worker_pool.py +0 -0
  57. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sandbox/__init__.py +0 -0
  58. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sandbox/runner.py +0 -0
  59. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/__init__.py +0 -0
  60. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/atlassian_common.py +0 -0
  61. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/azure_blob_storage/__init__.py +0 -0
  62. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/azure_blob_storage/source.py +0 -0
  63. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/base.py +0 -0
  64. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/confluence/__init__.py +0 -0
  65. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/confluence/source.py +0 -0
  66. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/databricks/__init__.py +0 -0
  67. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/databricks/source.py +0 -0
  68. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/dependencies.py +0 -0
  69. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/google_cloud_storage/__init__.py +0 -0
  70. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/google_cloud_storage/source.py +0 -0
  71. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/hive/__init__.py +0 -0
  72. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/hive/source.py +0 -0
  73. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/jira/__init__.py +0 -0
  74. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/jira/source.py +0 -0
  75. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/mongodb/__init__.py +0 -0
  76. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/mongodb/source.py +0 -0
  77. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/mssql/__init__.py +0 -0
  78. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/mssql/source.py +0 -0
  79. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/mysql/__init__.py +0 -0
  80. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/mysql/source.py +0 -0
  81. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/neo4j/__init__.py +0 -0
  82. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/neo4j/source.py +0 -0
  83. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/object_storage/base.py +0 -0
  84. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/oracle/__init__.py +0 -0
  85. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/oracle/source.py +0 -0
  86. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/postgresql/__init__.py +0 -0
  87. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/postgresql/source.py +0 -0
  88. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/powerbi/__init__.py +0 -0
  89. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/powerbi/source.py +0 -0
  90. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/recipe_normalizer.py +0 -0
  91. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/s3_compatible_storage/README.md +0 -0
  92. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/s3_compatible_storage/__init__.py +0 -0
  93. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/s3_compatible_storage/source.py +0 -0
  94. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/servicedesk/__init__.py +0 -0
  95. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/servicedesk/source.py +0 -0
  96. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/slack/__init__.py +0 -0
  97. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/slack/source.py +0 -0
  98. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/snowflake/__init__.py +0 -0
  99. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/snowflake/source.py +0 -0
  100. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/sqlite/__init__.py +0 -0
  101. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/sqlite/source.py +0 -0
  102. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/tableau/__init__.py +0 -0
  103. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/tableau/source.py +0 -0
  104. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/tabular_base.py +0 -0
  105. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/tabular_utils.py +0 -0
  106. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/wordpress/__init__.py +0 -0
  107. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/sources/wordpress/source.py +0 -0
  108. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/telemetry.py +0 -0
  109. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/utils/__init__.py +0 -0
  110. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/utils/content_extraction.py +0 -0
  111. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/utils/file_parser.py +0 -0
  112. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/utils/hashing.py +0 -0
  113. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/utils/uv_sync.py +0 -0
  114. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/src/utils/validation.py +0 -0
  115. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/__init__.py +0 -0
  116. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/conftest.py +0 -0
  117. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/__init__.py +0 -0
  118. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  119. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/conftest.py +0 -0
  120. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/content/__init__.py +0 -0
  121. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/custom/__init__.py +0 -0
  122. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/custom/conftest.py +0 -0
  123. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  124. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  125. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/custom/test_regex_runner.py +0 -0
  126. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/custom/test_transformer_runners.py +0 -0
  127. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/pii/__init__.py +0 -0
  128. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/pii/conftest.py +0 -0
  129. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/pii/sample_invoice.pdf +0 -0
  130. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/pii/test_pii_detector.py +0 -0
  131. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  132. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/secrets/__init__.py +0 -0
  133. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  134. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  135. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/test_base_detector.py +0 -0
  136. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  137. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  138. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/test_detector_pipeline_types.py +0 -0
  139. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/test_detector_schema_examples.py +0 -0
  140. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/test_detector_types.py +0 -0
  141. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/test_phase2_detectors.py +0 -0
  142. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/test_registry.py +0 -0
  143. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/threat/__init__.py +0 -0
  144. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/threat/test_code_security_detector.py +0 -0
  145. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/detectors/threat/test_yara_detector.py +0 -0
  146. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  147. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/integration/test_wordpress_links_assets.py +0 -0
  148. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/pipeline/test_detector_pipeline.py +0 -0
  149. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/pipeline/test_worker_pool.py +0 -0
  150. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_azure_blob_storage_source.py +0 -0
  151. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_base_source_attachment.py +0 -0
  152. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_base_source_sampling.py +0 -0
  153. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_confluence_source.py +0 -0
  154. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_databricks_source.py +0 -0
  155. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_google_cloud_storage_source.py +0 -0
  156. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_hashing.py +0 -0
  157. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_hive_source.py +0 -0
  158. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_jira_source.py +0 -0
  159. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_mongodb_source.py +0 -0
  160. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_mssql_source.py +0 -0
  161. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_mysql_source.py +0 -0
  162. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_neo4j_source.py +0 -0
  163. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_oracle_source.py +0 -0
  164. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_outputs.py +0 -0
  165. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_postgresql_source.py +0 -0
  166. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_powerbi_source.py +0 -0
  167. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_recipe_normalizer.py +0 -0
  168. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_s3_compatible_storage_source.py +0 -0
  169. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_servicedesk_source.py +0 -0
  170. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_slack_source.py +0 -0
  171. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_snowflake_source.py +0 -0
  172. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_source_dependency_groups.py +0 -0
  173. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_sqlite_source.py +0 -0
  174. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_tableau_source.py +0 -0
  175. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_tabular_utils.py +0 -0
  176. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/test_wordpress_source.py +0 -0
  177. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/utils/test_content_extraction.py +0 -0
  178. {classifyre_cli-0.4.4 → classifyre_cli-0.4.6}/tests/utils/test_file_parser.py +0 -0
@@ -0,0 +1,3 @@
1
+ $ uv sync
2
+ Resolved 256 packages in 322ms
3
+ Checked 49 packages in 0.88ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.4
3
+ Version: 0.4.6
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.4",
3
+ "version": "0.4.6",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.4"
3
+ version = "0.4.6"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -0,0 +1,261 @@
1
+ """Custom detector extraction engine — REGEX, GLINER, and CLASSIFIER_GLINER strategies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from dataclasses import dataclass, field
8
+ from typing import Any
9
+
10
+ from ...models.generated_detectors import (
11
+ CustomDetectorMethod,
12
+ CustomExtractorConfig,
13
+ CustomExtractorField,
14
+ )
15
+ from ..dependencies import MissingDependencyError, require_module
16
+
17
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# GLiNER2 checkpoint loaded when the extractor config does not name a model
# (see the `self._config.gliner_model or _DEFAULT_GLINER2_MODEL` fallback).
_DEFAULT_GLINER2_MODEL = "fastino/gliner2-base-v1"

# Extraction method tags sent to the API via DetectionResult.extraction_method
EXTRACTION_METHOD_REGEX = "REGEX"  # used for RULESET detectors
EXTRACTION_METHOD_GLINER = "GLINER"  # used for ENTITY detectors
EXTRACTION_METHOD_CLASSIFIER_GLINER = "CLASSIFIER_GLINER"  # used for CLASSIFIER detectors
25
+
26
+
27
@dataclass
class ExtractionResult:
    """Structured outcome of a single extraction run.

    ``populated_fields`` and ``field_count`` are derived values: they are
    recomputed from ``extracted_data`` right after initialisation, so anything
    passed for them to the constructor is overwritten.
    """

    # Field name -> extracted value.
    extracted_data: dict[str, Any]
    # Tag naming the strategy that produced the data.
    method: str
    # Names of fields whose value is not None, [] or "".
    populated_fields: list[str] = field(default_factory=list)
    # Number of entries in extracted_data (populated or not).
    field_count: int = 0

    def __post_init__(self) -> None:
        """Derive ``populated_fields`` and ``field_count`` from ``extracted_data``."""
        non_empty: list[str] = []
        for key, value in self.extracted_data.items():
            if value is None:
                continue
            if value in ([], ""):
                continue
            non_empty.append(key)

        self.populated_fields = non_empty
        self.field_count = len(self.extracted_data)
41
+
42
+
43
class CustomExtractor:
    """
    Runs after a custom detector fires to pull structured data from the content.

    Strategy selection:
        RULESET    → REGEX (named capture groups in field.regex_pattern)
        ENTITY     → GLINER (group GLiNER2 entity spans by entity_label into fields)
        CLASSIFIER → CLASSIFIER_GLINER (second GLiNER2 pass on wider content slice)
    """

    # Single-letter flag codes accepted in a field's regex_flags string.
    # Any other character is ignored.
    _REGEX_FLAGS = {
        "i": re.IGNORECASE,
        "m": re.MULTILINE,
        "s": re.DOTALL,
    }

    def __init__(
        self,
        config: CustomExtractorConfig,
        detector_method: CustomDetectorMethod,
    ) -> None:
        """
        Args:
            config: Extractor configuration (enabled flag, fields, optional model name).
            detector_method: Detector method that selects the extraction strategy.
        """
        self._config = config
        self._method = detector_method
        self._gliner_model: Any | None = None  # loaded lazily by _load_gliner
        # Pattern cache keyed by (pattern, flags). A None value records a
        # pattern that failed to compile so it is not retried on every call.
        self._compiled: dict[tuple[str, str], re.Pattern[str] | None] = {}

    # ── Public API ───────────────────────────────────────────────────────────

    def extract(
        self,
        matched_content: str,
        content_for_extraction: str,
    ) -> ExtractionResult | None:
        """
        Run extraction and return structured result, or None if nothing extracted.

        Args:
            matched_content: The content stored in the finding (may be truncated).
                Not read by any strategy in this method.
            content_for_extraction: Wider slice of the original document for GLiNER/regex.

        Returns:
            The extraction result, or None when the extractor is disabled, the
            detector method is unknown, or the chosen strategy yields nothing.
        """
        if not self._config.enabled:
            return None

        if self._method == CustomDetectorMethod.RULESET:
            return self._extract_regex(content_for_extraction)
        if self._method == CustomDetectorMethod.ENTITY:
            return self._extract_gliner(content_for_extraction, EXTRACTION_METHOD_GLINER)
        if self._method == CustomDetectorMethod.CLASSIFIER:
            return self._extract_gliner(content_for_extraction, EXTRACTION_METHOD_CLASSIFIER_GLINER)
        logger.warning("CustomExtractor: unknown detector method %s", self._method)
        return None

    # ── RULESET — regex named groups ─────────────────────────────────────────

    def _extract_regex(self, content: str) -> ExtractionResult | None:
        """Apply every configured field that has a regex_pattern to ``content``."""
        data: dict[str, Any] = {}

        for f in self._config.fields:
            if not f.regex_pattern:
                logger.debug(
                    "Extractor field '%s' has no regex_pattern — skipped for RULESET", f.name
                )
                continue
            value = self._apply_regex_field(content, f)
            if value is not None:
                data[f.name] = value

        return self._finalize(data, EXTRACTION_METHOD_REGEX)

    def _apply_regex_field(self, content: str, f: CustomExtractorField) -> Any:
        """
        Collect all matches of one field's pattern and aggregate them.

        The first named group of the pattern is captured when one exists;
        otherwise the whole match is used. Flags default to "i" when the field
        sets none. Returns None when the pattern is invalid or nothing matches.
        """
        pattern = self._compile(f.regex_pattern or "", f.regex_flags or "i")
        if pattern is None:
            return None

        group_name = next(iter(pattern.groupindex), None)

        matches: list[str] = []
        for m in pattern.finditer(content):
            captured = m.group(group_name) if group_name else m.group(0)
            # An optional group that did not participate yields None.
            if captured and captured.strip():
                matches.append(captured.strip())

        return self._aggregate(matches, f) if matches else None

    def _compile(self, pattern: str, flags_str: str) -> re.Pattern[str] | None:
        """
        Compile ``pattern`` with the flags named in ``flags_str``, using the cache.

        Returns:
            The compiled pattern, or None when the pattern is invalid. Failures
            are cached too, so an invalid pattern is logged once per extractor.
        """
        # A tuple key cannot collide the way a "pattern::flags" string could.
        cache_key = (pattern, flags_str)
        if cache_key in self._compiled:
            return self._compiled[cache_key]

        flags = 0
        for ch in flags_str:
            flags |= self._REGEX_FLAGS.get(ch, 0)

        compiled: re.Pattern[str] | None
        try:
            compiled = re.compile(pattern, flags=flags)
        except re.error as exc:
            logger.warning("CustomExtractor: invalid regex pattern '%s': %s", pattern, exc)
            compiled = None

        self._compiled[cache_key] = compiled
        return compiled

    # ── ENTITY / CLASSIFIER — GLiNER2 entity spans ───────────────────────────

    def _extract_gliner(self, content: str, method_tag: str) -> ExtractionResult | None:
        """
        Run GLiNER2 entity extraction and map the spans onto configured fields.

        Args:
            content: Text handed to the model.
            method_tag: Method tag recorded on the returned result.

        Returns:
            The finalized result, or None when no field has an entity_label,
            the model is unavailable, the model call fails, or the response
            does not have the expected dict shape.
        """
        label_to_fields: dict[str, list[CustomExtractorField]] = {}
        for f in self._config.fields:
            if f.entity_label:
                label_to_fields.setdefault(f.entity_label, []).append(f)

        if not label_to_fields:
            logger.debug("CustomExtractor: no fields with entity_label — skipping GLiNER2")
            return None

        model = self._load_gliner()
        if model is None:
            return None

        # Label -> first non-blank description among its fields ("" if none).
        entity_schema = {
            label: next(
                (
                    candidate.description
                    for candidate in fields
                    if isinstance(candidate.description, str) and candidate.description.strip()
                ),
                "",
            )
            for label, fields in label_to_fields.items()
        }
        try:
            # threshold=0.0 is passed to the model; the per-field min_confidence
            # filter is applied below.
            result = model.extract_entities(
                content,
                entity_schema,
                threshold=0.0,
                include_confidence=True,
            )
        except Exception as exc:  # pragma: no cover
            logger.warning("CustomExtractor: GLiNER2 extraction failed: %s", exc)
            return None

        # Guard against an unexpected response type instead of raising.
        entities = result.get("entities", {}) if isinstance(result, dict) else None
        if not isinstance(entities, dict):
            return None

        data: dict[str, Any] = {}
        for entity_label, fields in label_to_fields.items():
            raw_spans = entities.get(entity_label, [])
            if not isinstance(raw_spans, list):
                raw_spans = [raw_spans]

            for f in fields:
                threshold = f.min_confidence if f.min_confidence is not None else 0.4
                values = self._filter_gliner2_values(raw_spans, threshold)
                value = self._aggregate(values, f) if values else None
                if value is not None:
                    data[f.name] = value

        return self._finalize(data, method_tag)

    def _filter_gliner2_values(self, raw_spans: list[Any], threshold: float) -> list[str]:
        """
        Return the stripped text of every span scoring at least ``threshold``.

        Dict spans take their score from "confidence", falling back to "score";
        a missing, None or non-numeric score counts as 0.0. Non-dict spans are
        used as text with a score of 1.0. None spans and None texts are
        skipped so they never turn into the literal string "None".
        """
        values: list[str] = []
        for raw_span in raw_spans:
            if raw_span is None:
                continue

            if isinstance(raw_span, dict):
                raw_score = raw_span.get("confidence")
                if raw_score is None:
                    raw_score = raw_span.get("score")
                try:
                    score = float(raw_score) if raw_score is not None else 0.0
                except (TypeError, ValueError):
                    score = 0.0
                raw_text = raw_span.get("text")
            else:
                score = 1.0
                raw_text = raw_span

            text = "" if raw_text is None else str(raw_text).strip()
            if score >= threshold and text:
                values.append(text)

        return values

    def _load_gliner(self) -> Any | None:
        """
        Return the GLiNER2 model, loading it on first use.

        Returns:
            The loaded model, or None when loading fails for any reason other
            than a missing dependency.

        Raises:
            MissingDependencyError: Re-raised unchanged from require_module.
        """
        if self._gliner_model is not None:
            return self._gliner_model
        try:
            gliner2_module = require_module("gliner2", "custom", ["classification", "detectors"])
            model_name = self._config.gliner_model or _DEFAULT_GLINER2_MODEL
            self._gliner_model = gliner2_module.GLiNER2.from_pretrained(model_name)
            return self._gliner_model
        except MissingDependencyError:
            raise
        except Exception as exc:  # pragma: no cover
            logger.warning("CustomExtractor: failed to load GLiNER2: %s", exc)
            return None

    # ── Shared helpers ────────────────────────────────────────────────────────

    def _aggregate(self, values: list[str], f: CustomExtractorField) -> Any:
        """
        Reduce ``values`` according to the field's aggregate mode.

        Modes: "first", "last", "list" (default), "join" (separator defaults to
        ", "), "count". Unknown modes behave like "list". Returns None for an
        empty input.
        """
        if not values:
            return None
        aggregate = f.aggregate or "list"
        if aggregate == "first":
            return values[0]
        if aggregate == "last":
            return values[-1]
        if aggregate == "list":
            return values
        if aggregate == "join":
            sep = f.join_separator if f.join_separator is not None else ", "
            return sep.join(values)
        if aggregate == "count":
            return len(values)
        return values  # fallback

    def _finalize(self, data: dict[str, Any], method: str) -> ExtractionResult | None:
        """Wrap ``data`` in an ExtractionResult, or return None if it must be discarded."""
        # Required fields gate: if any required field is missing, discard the result
        for f in self._config.fields:
            if f.required and f.name not in data:
                logger.debug(
                    "CustomExtractor: required field '%s' not populated — discarding", f.name
                )
                return None

        if not data:
            return None

        return ExtractionResult(extracted_data=data, method=method)
@@ -0,0 +1,291 @@
1
+ """Tests for CustomExtractor — all three strategies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.detectors.custom.extractor import (
6
+ EXTRACTION_METHOD_CLASSIFIER_GLINER,
7
+ EXTRACTION_METHOD_GLINER,
8
+ EXTRACTION_METHOD_REGEX,
9
+ CustomExtractor,
10
+ ExtractionResult,
11
+ )
12
+ from src.models.generated_detectors import (
13
+ CustomDetectorMethod,
14
+ CustomExtractorConfig,
15
+ CustomExtractorField,
16
+ )
17
+
18
+
19
def make_config(*fields_kwargs: dict, **config_kwargs) -> CustomExtractorConfig:
    """Build a CustomExtractorConfig for tests.

    Each positional dict is expanded into one CustomExtractorField; any keyword
    arguments are forwarded to CustomExtractorConfig unchanged.
    """
    built_fields = []
    for field_spec in fields_kwargs:
        built_fields.append(CustomExtractorField(**field_spec))
    return CustomExtractorConfig(fields=built_fields, **config_kwargs)
22
+
23
+
24
+ # ── RULESET / REGEX ──────────────────────────────────────────────────────────
25
+
26
+
27
class TestRegexExtraction:
    """RULESET strategy: regex-based extraction through CustomExtractor."""

    def _extractor(self, *fields_kwargs: dict) -> CustomExtractor:
        """Return a RULESET-mode extractor configured with the given field dicts."""
        config = make_config(*fields_kwargs)
        return CustomExtractor(config, CustomDetectorMethod.RULESET)

    def test_extracts_named_group(self):
        text = "price is 29.99 EUR today"
        amount_field = {
            "name": "amount",
            "regex_pattern": r"(?P<value>\d+[.,]\d+)\s*EUR",
            "aggregate": "first",
        }
        result = self._extractor(amount_field).extract(text, text)
        assert result is not None
        assert result.extracted_data["amount"] == "29.99"
        assert result.method == EXTRACTION_METHOD_REGEX
        assert "amount" in result.populated_fields

    def test_list_aggregate_collects_all(self):
        text = "a@b.com and c@d.com"
        emails_field = {
            "name": "emails",
            "regex_pattern": r"(?P<value>[a-z]+@[a-z]+\.[a-z]+)",
            "aggregate": "list",
        }
        result = self._extractor(emails_field).extract(text, text)
        assert result is not None
        assert result.extracted_data["emails"] == ["a@b.com", "c@d.com"]

    def test_join_aggregate(self):
        text = "found #food and #recipe here"
        tags_field = {
            "name": "tags",
            "regex_pattern": r"#(?P<value>\w+)",
            "aggregate": "join",
            "join_separator": " | ",
        }
        result = self._extractor(tags_field).extract(text, text)
        assert result is not None
        assert result.extracted_data["tags"] == "food | recipe"

    def test_count_aggregate(self):
        text = "car rental here and car rental there"
        count_field = {
            "name": "mention_count",
            "regex_pattern": r"(?P<value>car rental)",
            "aggregate": "count",
            "regex_flags": "i",
        }
        result = self._extractor(count_field).extract(text, text)
        assert result is not None
        assert result.extracted_data["mention_count"] == 2

    def test_no_match_returns_none(self):
        text = "no iban here"
        iban_field = {"name": "iban", "regex_pattern": r"(?P<value>DE\d{20})", "aggregate": "first"}
        result = self._extractor(iban_field).extract(text, text)
        assert result is None

    def test_required_field_gates_result(self):
        text = "foo bar baz"
        optional_field = {
            "name": "optional",
            "regex_pattern": r"(?P<value>foo)",
            "aggregate": "first",
        }
        required_field = {
            "name": "must_have",
            "regex_pattern": r"(?P<value>REQUIRED)",
            "aggregate": "first",
            "required": True,
        }
        result = self._extractor(optional_field, required_field).extract(text, text)
        assert result is None  # must_have not populated

    def test_required_field_allows_result_when_present(self):
        text = "text with REQUIRED word"
        required_field = {
            "name": "must_have",
            "regex_pattern": r"(?P<value>REQUIRED)",
            "aggregate": "first",
            "required": True,
        }
        result = self._extractor(required_field).extract(text, text)
        assert result is not None
        assert result.extracted_data["must_have"] == "REQUIRED"

    def test_invalid_regex_skipped_gracefully(self):
        bad_field = {"name": "bad", "regex_pattern": r"[invalid", "aggregate": "first"}
        good_field = {"name": "good", "regex_pattern": r"(?P<value>ok)", "aggregate": "first"}
        result = self._extractor(bad_field, good_field).extract("ok", "ok")
        assert result is not None
        assert "good" in result.extracted_data
        assert "bad" not in result.extracted_data

    def test_case_insensitive_flag(self):
        text = "HELLO world"
        word_field = {
            "name": "word",
            "regex_pattern": r"(?P<value>hello)",
            "aggregate": "first",
            "regex_flags": "i",
        }
        result = self._extractor(word_field).extract(text, text)
        assert result is not None
        assert result.extracted_data["word"].lower() == "hello"

    def test_disabled_extractor_returns_none(self):
        any_word_field = {"name": "f", "regex_pattern": r"(?P<value>\w+)", "aggregate": "first"}
        disabled_config = make_config(any_word_field, enabled=False)
        extractor = CustomExtractor(disabled_config, CustomDetectorMethod.RULESET)
        assert extractor.extract("hello", "hello") is None

    def test_extraction_result_populated_fields(self):
        text = "yes only"
        field_a = {"name": "a", "regex_pattern": r"(?P<value>yes)", "aggregate": "first"}
        field_b = {"name": "b", "regex_pattern": r"(?P<value>no)", "aggregate": "first"}
        result = self._extractor(field_a, field_b).extract(text, text)
        assert result is not None
        assert "a" in result.populated_fields
        assert "b" not in result.populated_fields
158
+
159
+
160
+ # ── ENTITY / GLINER ──────────────────────────────────────────────────────────
161
+
162
+
163
class TestGlinerExtraction:
    """Entity-extraction tests driven by a stubbed GLiNER model (no model download)."""

    def _extractor_with_mock_gliner(
        self, fields: list[dict], mock_entities: list[dict]
    ) -> CustomExtractor:
        """Return an ENTITY-method extractor whose GLiNER model is a stub.

        Args:
            fields: Field definitions forwarded positionally to ``make_config``.
            mock_entities: Canned spans, each a dict with ``label``, ``text``
                and ``score`` keys, that the stub hands back.

        Returns:
            A ``CustomExtractor`` with ``_gliner_model`` replaced by the stub.
        """
        extractor = CustomExtractor(make_config(*fields), CustomDetectorMethod.ENTITY)

        class MockGliner:
            def extract_entities(self, content: str, labels: dict[str, str], **_kwargs) -> dict:
                # Group the canned spans under every requested label; a label
                # with no matching span still gets an (empty) list. Each span's
                # "score" is exposed under the "confidence" key.
                grouped: dict = {}
                for wanted in labels:
                    hits = []
                    for span in mock_entities:
                        if span.get("label") == wanted:
                            hits.append({"text": span["text"], "confidence": span["score"]})
                    grouped[wanted] = hits
                return {"entities": grouped}

        extractor._gliner_model = MockGliner()
        return extractor

    def test_groups_entity_spans_by_label(self):
        """Spans are routed to the field whose ``entity_label`` they carry."""
        persons_field = {
            "name": "persons",
            "entity_label": "person",
            "type": "list[string]",
            "aggregate": "list",
        }
        orgs_field = {
            "name": "orgs",
            "entity_label": "organization",
            "type": "list[string]",
            "aggregate": "list",
        }
        canned_spans = [
            {"label": "person", "text": "Alice", "score": 0.9},
            {"label": "person", "text": "Bob", "score": 0.8},
            {"label": "organization", "text": "Acme Corp", "score": 0.85},
        ]
        extractor = self._extractor_with_mock_gliner([persons_field, orgs_field], canned_spans)

        outcome = extractor.extract("text", "Alice and Bob work at Acme Corp")

        assert outcome is not None
        assert outcome.extracted_data["persons"] == ["Alice", "Bob"]
        assert outcome.extracted_data["orgs"] == ["Acme Corp"]
        assert outcome.method == EXTRACTION_METHOD_GLINER

    def test_min_confidence_filters_low_score(self):
        """A span scoring below the field's ``min_confidence`` is dropped."""
        items_field = {
            "name": "items",
            "entity_label": "item",
            "aggregate": "list",
            "min_confidence": 0.8,
        }
        canned_spans = [
            {"label": "item", "text": "high conf", "score": 0.9},
            {"label": "item", "text": "low conf", "score": 0.3},
        ]
        extractor = self._extractor_with_mock_gliner([items_field], canned_spans)

        outcome = extractor.extract("text", "text")

        assert outcome is not None
        assert outcome.extracted_data["items"] == ["high conf"]

    def test_first_aggregate_takes_first(self):
        """With ``aggregate="first"`` the first returned span becomes the value."""
        role_field = {"name": "role", "entity_label": "job title", "aggregate": "first"}
        canned_spans = [
            {"label": "job title", "text": "CEO", "score": 0.9},
            {"label": "job title", "text": "CFO", "score": 0.85},
        ]
        extractor = self._extractor_with_mock_gliner([role_field], canned_spans)

        outcome = extractor.extract("text", "text")

        assert outcome is not None
        assert outcome.extracted_data["role"] == "CEO"

    def test_no_entities_returns_none(self):
        """When the model reports no spans at all, extraction yields ``None``."""
        dish_field = {"name": "dish", "entity_label": "food dish", "aggregate": "list"}
        extractor = self._extractor_with_mock_gliner([dish_field], [])

        outcome = extractor.extract("no food here", "no food here")

        assert outcome is None

    def test_classifier_method_uses_classifier_gliner_tag(self):
        """A CLASSIFIER-method extractor tags results as classifier+GLiNER."""
        dish_field = {"name": "dish", "entity_label": "food dish", "aggregate": "list"}
        extractor = CustomExtractor(make_config(dish_field), CustomDetectorMethod.CLASSIFIER)

        class MockGliner:
            def extract_entities(self, content: str, labels: dict[str, str], **_kwargs) -> dict:
                # Every requested label receives its own single "pizza" hit.
                per_label = {}
                for wanted in labels:
                    per_label[wanted] = [{"text": "pizza", "confidence": 0.9}]
                return {"entities": per_label}

        extractor._gliner_model = MockGliner()

        outcome = extractor.extract("text", "I ate pizza")

        assert outcome is not None
        assert outcome.method == EXTRACTION_METHOD_CLASSIFIER_GLINER

    def test_required_field_gates_gliner_result(self):
        """A missing ``required`` field suppresses the whole result."""
        optional_field = {"name": "optional_field", "entity_label": "item", "aggregate": "list"}
        required_field = {
            "name": "required_field",
            "entity_label": "must",
            "aggregate": "first",
            "required": True,
        }
        canned_spans = [{"label": "item", "text": "something", "score": 0.9}]
        extractor = self._extractor_with_mock_gliner(
            [optional_field, required_field], canned_spans
        )

        outcome = extractor.extract("text", "text")

        # required_field (label "must") was not found, so nothing is returned
        # even though optional_field did get a hit.
        assert outcome is None
277
+
278
+
279
+ # ── ExtractionResult ─────────────────────────────────────────────────────────
280
+
281
+
282
class TestExtractionResult:
    """Checks on the derived attributes of ``ExtractionResult``."""

    def test_populated_fields_excludes_empty_list(self):
        """A field holding an empty list is not reported as populated."""
        data = {"a": ["x"], "b": [], "c": "hello"}
        outcome = ExtractionResult(extracted_data=data, method="REGEX")

        populated = outcome.populated_fields

        assert "a" in populated
        assert "b" not in populated
        assert "c" in populated

    def test_field_count_matches_data_keys(self):
        """``field_count`` equals the number of keys in ``extracted_data``."""
        data = {"x": 1, "y": 2, "z": 3}
        outcome = ExtractionResult(extracted_data=data, method="GLINER")

        assert outcome.field_count == 3
@@ -531,7 +531,7 @@ wheels = [
531
531
 
532
532
  [[package]]
533
533
  name = "classifyre-cli"
534
- version = "0.4.4"
534
+ version = "0.4.6"
535
535
  source = { editable = "." }
536
536
  dependencies = [
537
537
  { name = "beautifulsoup4" },
@@ -838,7 +838,7 @@ threat-ml = [
838
838
 
839
839
  [[package]]
840
840
  name = "classifyre-schemas"
841
- version = "0.4.4"
841
+ version = "0.4.6"
842
842
  source = { editable = "../../packages/schemas" }
843
843
  dependencies = [
844
844
  { name = "fastjsonschema" },
@@ -1,3 +0,0 @@
1
- $ uv sync
2
- Resolved 256 packages in 170ms
3
- Checked 49 packages in 1ms
File without changes
File without changes