classifyre-cli 0.4.3__tar.gz → 0.4.5__tar.gz

This diff compares the contents of two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (183)
  1. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/.turbo/turbo-build.log +1 -1
  2. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/PKG-INFO +1 -1
  3. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/package.json +1 -1
  4. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/pyproject.toml +1 -1
  5. classifyre_cli-0.4.5/src/detectors/custom/extractor.py +261 -0
  6. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/main.py +32 -1
  7. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/models/generated_input.py +64 -0
  8. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/detector_pipeline.py +60 -35
  9. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/worker_pool.py +17 -10
  10. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/databricks/source.py +287 -672
  11. classifyre_cli-0.4.5/src/sources/hive/source.py +304 -0
  12. classifyre_cli-0.4.5/src/sources/mssql/source.py +621 -0
  13. classifyre_cli-0.4.5/src/sources/mysql/source.py +303 -0
  14. classifyre_cli-0.4.5/src/sources/oracle/source.py +632 -0
  15. classifyre_cli-0.4.5/src/sources/postgresql/source.py +214 -0
  16. classifyre_cli-0.4.5/src/sources/snowflake/source.py +624 -0
  17. classifyre_cli-0.4.5/src/sources/sqlite/source.py +212 -0
  18. classifyre_cli-0.4.5/src/sources/tabular_base.py +793 -0
  19. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/tabular_utils.py +36 -0
  20. classifyre_cli-0.4.5/tests/detectors/threat/__init__.py +0 -0
  21. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/pipeline/test_detector_pipeline.py +1 -4
  22. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/pipeline/test_worker_pool.py +1 -0
  23. classifyre_cli-0.4.5/tests/test_custom_extractor.py +291 -0
  24. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_databricks_source.py +9 -9
  25. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_hive_source.py +8 -8
  26. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_mssql_source.py +5 -5
  27. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_mysql_source.py +8 -8
  28. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_oracle_source.py +27 -41
  29. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_postgresql_source.py +3 -0
  30. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_snowflake_source.py +2 -2
  31. classifyre_cli-0.4.5/tests/test_sqlite_source.py +336 -0
  32. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/uv.lock +2 -2
  33. classifyre_cli-0.4.3/src/sources/hive/source.py +0 -709
  34. classifyre_cli-0.4.3/src/sources/mssql/source.py +0 -1034
  35. classifyre_cli-0.4.3/src/sources/mysql/source.py +0 -797
  36. classifyre_cli-0.4.3/src/sources/oracle/source.py +0 -982
  37. classifyre_cli-0.4.3/src/sources/postgresql/source.py +0 -774
  38. classifyre_cli-0.4.3/src/sources/snowflake/source.py +0 -912
  39. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/.gitignore +0 -0
  40. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/.python-version +0 -0
  41. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/README.md +0 -0
  42. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/main.py +0 -0
  43. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/scripts/generate_models.py +0 -0
  44. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/__init__.py +0 -0
  45. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/__init__.py +0 -0
  46. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/base.py +0 -0
  47. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/broken_links/__init__.py +0 -0
  48. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/broken_links/detector.py +0 -0
  49. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/config.py +0 -0
  50. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/content/__init__.py +0 -0
  51. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/__init__.py +0 -0
  52. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/detector.py +0 -0
  53. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/__init__.py +0 -0
  54. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_base.py +0 -0
  55. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_factory.py +0 -0
  56. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_feature_extraction.py +0 -0
  57. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_gliner2.py +0 -0
  58. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_image_classification.py +0 -0
  59. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_llm.py +0 -0
  60. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_object_detection.py +0 -0
  61. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_regex.py +0 -0
  62. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_text_classification.py +0 -0
  63. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/trainer.py +0 -0
  64. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/dependencies.py +0 -0
  65. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/pii/__init__.py +0 -0
  66. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/pii/detector.py +0 -0
  67. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/secrets/__init__.py +0 -0
  68. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/secrets/detector.py +0 -0
  69. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/threat/__init__.py +0 -0
  70. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/threat/code_security_detector.py +0 -0
  71. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/threat/yara_detector.py +0 -0
  72. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/models/generated_detectors.py +0 -0
  73. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/models/generated_single_asset_scan_results.py +0 -0
  74. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/__init__.py +0 -0
  75. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/base.py +0 -0
  76. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/console.py +0 -0
  77. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/factory.py +0 -0
  78. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/file.py +0 -0
  79. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/rest.py +0 -0
  80. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/__init__.py +0 -0
  81. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/content_provider.py +0 -0
  82. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/parsed_content_provider.py +0 -0
  83. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sandbox/__init__.py +0 -0
  84. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sandbox/runner.py +0 -0
  85. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/__init__.py +0 -0
  86. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/atlassian_common.py +0 -0
  87. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/azure_blob_storage/__init__.py +0 -0
  88. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/azure_blob_storage/source.py +0 -0
  89. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/base.py +0 -0
  90. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/confluence/__init__.py +0 -0
  91. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/confluence/source.py +0 -0
  92. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/databricks/__init__.py +0 -0
  93. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/dependencies.py +0 -0
  94. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/google_cloud_storage/__init__.py +0 -0
  95. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/google_cloud_storage/source.py +0 -0
  96. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/hive/__init__.py +0 -0
  97. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/jira/__init__.py +0 -0
  98. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/jira/source.py +0 -0
  99. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/mongodb/__init__.py +0 -0
  100. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/mongodb/source.py +0 -0
  101. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/mssql/__init__.py +0 -0
  102. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/mysql/__init__.py +0 -0
  103. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/neo4j/__init__.py +0 -0
  104. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/neo4j/source.py +0 -0
  105. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/object_storage/base.py +0 -0
  106. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/oracle/__init__.py +0 -0
  107. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/postgresql/__init__.py +0 -0
  108. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/powerbi/__init__.py +0 -0
  109. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/powerbi/source.py +0 -0
  110. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/recipe_normalizer.py +0 -0
  111. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/s3_compatible_storage/README.md +0 -0
  112. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/s3_compatible_storage/__init__.py +0 -0
  113. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/s3_compatible_storage/source.py +0 -0
  114. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/servicedesk/__init__.py +0 -0
  115. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/servicedesk/source.py +0 -0
  116. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/slack/__init__.py +0 -0
  117. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/slack/source.py +0 -0
  118. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/snowflake/__init__.py +0 -0
  119. {classifyre_cli-0.4.3/tests → classifyre_cli-0.4.5/src/sources/sqlite}/__init__.py +0 -0
  120. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/tableau/__init__.py +0 -0
  121. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/tableau/source.py +0 -0
  122. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/wordpress/__init__.py +0 -0
  123. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/wordpress/source.py +0 -0
  124. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/telemetry.py +0 -0
  125. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/__init__.py +0 -0
  126. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/content_extraction.py +0 -0
  127. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/file_parser.py +0 -0
  128. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/hashing.py +0 -0
  129. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/uv_sync.py +0 -0
  130. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/validation.py +0 -0
  131. {classifyre_cli-0.4.3/tests/detectors → classifyre_cli-0.4.5/tests}/__init__.py +0 -0
  132. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/conftest.py +0 -0
  133. {classifyre_cli-0.4.3/tests/detectors/content → classifyre_cli-0.4.5/tests/detectors}/__init__.py +0 -0
  134. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
  135. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/conftest.py +0 -0
  136. {classifyre_cli-0.4.3/tests/detectors/custom → classifyre_cli-0.4.5/tests/detectors/content}/__init__.py +0 -0
  137. {classifyre_cli-0.4.3/tests/detectors/pii → classifyre_cli-0.4.5/tests/detectors/custom}/__init__.py +0 -0
  138. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/conftest.py +0 -0
  139. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/test_invoice_extraction.py +0 -0
  140. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/test_pipeline_integration.py +0 -0
  141. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/test_regex_runner.py +0 -0
  142. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/test_transformer_runners.py +0 -0
  143. {classifyre_cli-0.4.3/tests/detectors/secrets → classifyre_cli-0.4.5/tests/detectors/pii}/__init__.py +0 -0
  144. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/pii/conftest.py +0 -0
  145. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/pii/sample_invoice.pdf +0 -0
  146. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/pii/test_pii_detector.py +0 -0
  147. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
  148. {classifyre_cli-0.4.3/tests/detectors/threat → classifyre_cli-0.4.5/tests/detectors/secrets}/__init__.py +0 -0
  149. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/secrets/test_secrets_detector.py +0 -0
  150. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
  151. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_base_detector.py +0 -0
  152. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
  153. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_detector_catalog_commercial.py +0 -0
  154. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_detector_pipeline_types.py +0 -0
  155. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_detector_schema_examples.py +0 -0
  156. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_detector_types.py +0 -0
  157. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_phase2_detectors.py +0 -0
  158. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_registry.py +0 -0
  159. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/threat/test_code_security_detector.py +0 -0
  160. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/threat/test_yara_detector.py +0 -0
  161. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
  162. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/integration/test_wordpress_links_assets.py +0 -0
  163. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_azure_blob_storage_source.py +0 -0
  164. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_base_source_attachment.py +0 -0
  165. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_base_source_sampling.py +0 -0
  166. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_confluence_source.py +0 -0
  167. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_google_cloud_storage_source.py +0 -0
  168. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_hashing.py +0 -0
  169. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_jira_source.py +0 -0
  170. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_mongodb_source.py +0 -0
  171. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_neo4j_source.py +0 -0
  172. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_outputs.py +0 -0
  173. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_powerbi_source.py +0 -0
  174. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_recipe_normalizer.py +0 -0
  175. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_s3_compatible_storage_source.py +0 -0
  176. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_servicedesk_source.py +0 -0
  177. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_slack_source.py +0 -0
  178. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_source_dependency_groups.py +0 -0
  179. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_tableau_source.py +0 -0
  180. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_tabular_utils.py +0 -0
  181. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_wordpress_source.py +0 -0
  182. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/utils/test_content_extraction.py +0 -0
  183. {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/utils/test_file_parser.py +0 -0
@@ -1,3 +1,3 @@
1
1
  $ uv sync
2
- Resolved 256 packages in 159ms
2
+ Resolved 256 packages in 203ms
3
3
  Checked 49 packages in 1ms
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: classifyre-cli
3
- Version: 0.4.3
3
+ Version: 0.4.5
4
4
  Summary: Classifyre CLI — scan and classify unstructured data sources
5
5
  License: MIT
6
6
  Keywords: data,ingestion,metadata,pii,secrets,unstructured
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@classifyre/cli",
3
- "version": "0.4.3",
3
+ "version": "0.4.5",
4
4
  "private": true,
5
5
  "scripts": {
6
6
  "build": "uv sync",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "classifyre-cli"
3
- version = "0.4.3"
3
+ version = "0.4.5"
4
4
  description = "Classifyre CLI — scan and classify unstructured data sources"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -0,0 +1,261 @@
1
+ """Custom detector extraction engine — REGEX, GLINER, and CLASSIFIER_GLINER strategies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from dataclasses import dataclass, field
8
+ from typing import Any
9
+
10
+ from ...models.generated_detectors import (
11
+ CustomDetectorMethod,
12
+ CustomExtractorConfig,
13
+ CustomExtractorField,
14
+ )
15
+ from ..dependencies import MissingDependencyError, require_module
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ _DEFAULT_GLINER2_MODEL = "fastino/gliner2-base-v1"
20
+
21
+ # Extraction method tags sent to the API via DetectionResult.extraction_method
22
+ EXTRACTION_METHOD_REGEX = "REGEX"
23
+ EXTRACTION_METHOD_GLINER = "GLINER"
24
+ EXTRACTION_METHOD_CLASSIFIER_GLINER = "CLASSIFIER_GLINER"
25
+
26
+
27
+ @dataclass
28
+ class ExtractionResult:
29
+ """Typed output from one extraction run."""
30
+
31
+ extracted_data: dict[str, Any]
32
+ method: str
33
+ populated_fields: list[str] = field(default_factory=list)
34
+ field_count: int = 0
35
+
36
+ def __post_init__(self) -> None:
37
+ self.populated_fields = [
38
+ k for k, v in self.extracted_data.items() if v is not None and v not in ([], "")
39
+ ]
40
+ self.field_count = len(self.extracted_data)
41
+
42
+
43
+ class CustomExtractor:
44
+ """
45
+ Runs after a custom detector fires to pull structured data from the content.
46
+
47
+ Strategy selection:
48
+ RULESET → REGEX (named capture groups in field.regex_pattern)
49
+ ENTITY → GLINER (group GLiNER2 entity spans by entity_label into fields)
50
+ CLASSIFIER → CLASSIFIER_GLINER (second GLiNER2 pass on wider content slice)
51
+ """
52
+
53
+ def __init__(
54
+ self,
55
+ config: CustomExtractorConfig,
56
+ detector_method: CustomDetectorMethod,
57
+ ) -> None:
58
+ self._config = config
59
+ self._method = detector_method
60
+ self._gliner_model: Any | None = None
61
+ self._compiled: dict[str, re.Pattern[str]] = {} # pattern cache
62
+
63
+ # ── Public API ───────────────────────────────────────────────────────────
64
+
65
+ def extract(
66
+ self,
67
+ matched_content: str,
68
+ content_for_extraction: str,
69
+ ) -> ExtractionResult | None:
70
+ """
71
+ Run extraction and return structured result, or None if nothing extracted.
72
+
73
+ Args:
74
+ matched_content: The content stored in the finding (may be truncated).
75
+ content_for_extraction: Wider slice of the original document for GLiNER/regex.
76
+ """
77
+ if not self._config.enabled:
78
+ return None
79
+
80
+ if self._method == CustomDetectorMethod.RULESET:
81
+ return self._extract_regex(content_for_extraction)
82
+ if self._method == CustomDetectorMethod.ENTITY:
83
+ return self._extract_gliner(content_for_extraction, EXTRACTION_METHOD_GLINER)
84
+ if self._method == CustomDetectorMethod.CLASSIFIER:
85
+ return self._extract_gliner(content_for_extraction, EXTRACTION_METHOD_CLASSIFIER_GLINER)
86
+ logger.warning("CustomExtractor: unknown detector method %s", self._method)
87
+ return None
88
+
89
+ # ── RULESET — regex named groups ─────────────────────────────────────────
90
+
91
+ def _extract_regex(self, content: str) -> ExtractionResult | None:
92
+ data: dict[str, Any] = {}
93
+
94
+ for f in self._config.fields:
95
+ if not f.regex_pattern:
96
+ logger.debug(
97
+ "Extractor field '%s' has no regex_pattern — skipped for RULESET", f.name
98
+ )
99
+ continue
100
+ value = self._apply_regex_field(content, f)
101
+ if value is not None:
102
+ data[f.name] = value
103
+
104
+ return self._finalize(data, EXTRACTION_METHOD_REGEX)
105
+
106
+ def _apply_regex_field(self, content: str, f: CustomExtractorField) -> Any:
107
+ pattern = self._compile(f.regex_pattern or "", f.regex_flags or "i")
108
+ if pattern is None:
109
+ return None
110
+
111
+ named_groups = pattern.groupindex
112
+ group_name = next(iter(named_groups), None)
113
+
114
+ matches: list[str] = []
115
+ for m in pattern.finditer(content):
116
+ captured = m.group(group_name) if group_name else m.group(0)
117
+ if captured and captured.strip():
118
+ matches.append(captured.strip())
119
+
120
+ return self._aggregate(matches, f) if matches else None
121
+
122
+ def _compile(self, pattern: str, flags_str: str) -> re.Pattern[str] | None:
123
+ cache_key = f"{pattern}::{flags_str}"
124
+ if cache_key in self._compiled:
125
+ return self._compiled[cache_key]
126
+
127
+ flags = 0
128
+ for ch in flags_str:
129
+ if ch == "i":
130
+ flags |= re.IGNORECASE
131
+ elif ch == "m":
132
+ flags |= re.MULTILINE
133
+ elif ch == "s":
134
+ flags |= re.DOTALL
135
+
136
+ try:
137
+ compiled = re.compile(pattern, flags=flags)
138
+ self._compiled[cache_key] = compiled
139
+ return compiled
140
+ except re.error as exc:
141
+ logger.warning("CustomExtractor: invalid regex pattern '%s': %s", pattern, exc)
142
+ return None
143
+
144
+ # ── ENTITY / CLASSIFIER — GLiNER2 entity spans ───────────────────────────
145
+
146
+ def _extract_gliner(self, content: str, method_tag: str) -> ExtractionResult | None:
147
+ label_to_fields: dict[str, list[CustomExtractorField]] = {}
148
+ for f in self._config.fields:
149
+ if f.entity_label:
150
+ label_to_fields.setdefault(f.entity_label, []).append(f)
151
+
152
+ if not label_to_fields:
153
+ logger.debug("CustomExtractor: no fields with entity_label — skipping GLiNER2")
154
+ return None
155
+
156
+ model = self._load_gliner()
157
+ if model is None:
158
+ return None
159
+
160
+ entity_schema = {
161
+ label: next(
162
+ (
163
+ field.description
164
+ for field in fields
165
+ if isinstance(field.description, str) and field.description.strip()
166
+ ),
167
+ "",
168
+ )
169
+ for label, fields in label_to_fields.items()
170
+ }
171
+ try:
172
+ result = model.extract_entities(
173
+ content,
174
+ entity_schema,
175
+ threshold=0.0,
176
+ include_confidence=True,
177
+ )
178
+ except Exception as exc: # pragma: no cover
179
+ logger.warning("CustomExtractor: GLiNER2 extraction failed: %s", exc)
180
+ return None
181
+
182
+ entities = result.get("entities", {})
183
+ if not isinstance(entities, dict):
184
+ return None
185
+
186
+ data: dict[str, Any] = {}
187
+ for entity_label, fields in label_to_fields.items():
188
+ raw_spans = entities.get(entity_label, [])
189
+ if not isinstance(raw_spans, list):
190
+ raw_spans = [raw_spans]
191
+
192
+ for f in fields:
193
+ threshold = f.min_confidence if f.min_confidence is not None else 0.4
194
+ values = self._filter_gliner2_values(raw_spans, threshold)
195
+ value = self._aggregate(values, f) if values else None
196
+ if value is not None:
197
+ data[f.name] = value
198
+
199
+ return self._finalize(data, method_tag)
200
+
201
+ def _filter_gliner2_values(self, raw_spans: list[Any], threshold: float) -> list[str]:
202
+ values: list[str] = []
203
+ for raw_span in raw_spans:
204
+ if isinstance(raw_span, dict):
205
+ score = float(raw_span.get("confidence", raw_span.get("score", 0.0)))
206
+ text = str(raw_span.get("text", "")).strip()
207
+ else:
208
+ score = 1.0
209
+ text = str(raw_span).strip()
210
+
211
+ if score >= threshold and text:
212
+ values.append(text)
213
+
214
+ return values
215
+
216
+ def _load_gliner(self) -> Any | None:
217
+ if self._gliner_model is not None:
218
+ return self._gliner_model
219
+ try:
220
+ gliner2_module = require_module("gliner2", "custom", ["classification", "detectors"])
221
+ model_name = self._config.gliner_model or _DEFAULT_GLINER2_MODEL
222
+ self._gliner_model = gliner2_module.GLiNER2.from_pretrained(model_name)
223
+ return self._gliner_model
224
+ except MissingDependencyError:
225
+ raise
226
+ except Exception as exc: # pragma: no cover
227
+ logger.warning("CustomExtractor: failed to load GLiNER2: %s", exc)
228
+ return None
229
+
230
+ # ── Shared helpers ────────────────────────────────────────────────────────
231
+
232
+ def _aggregate(self, values: list[str], f: CustomExtractorField) -> Any:
233
+ if not values:
234
+ return None
235
+ aggregate = f.aggregate or "list"
236
+ if aggregate == "first":
237
+ return values[0]
238
+ if aggregate == "last":
239
+ return values[-1]
240
+ if aggregate == "list":
241
+ return values
242
+ if aggregate == "join":
243
+ sep = f.join_separator if f.join_separator is not None else ", "
244
+ return sep.join(values)
245
+ if aggregate == "count":
246
+ return len(values)
247
+ return values # fallback
248
+
249
+ def _finalize(self, data: dict[str, Any], method: str) -> ExtractionResult | None:
250
+ # Required fields gate: if any required field is missing, discard the result
251
+ for f in self._config.fields:
252
+ if f.required and f.name not in data:
253
+ logger.debug(
254
+ "CustomExtractor: required field '%s' not populated — discarding", f.name
255
+ )
256
+ return None
257
+
258
+ if not data:
259
+ return None
260
+
261
+ return ExtractionResult(extracted_data=data, method=method)
@@ -243,14 +243,23 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
243
243
  import asyncio as _asyncio
244
244
 
245
245
  processed_count = 0
246
+ _pw = worker_pool.max_workers if worker_pool else 4
247
+ max_concurrent = args.max_concurrent_assets or (_pw * 2)
248
+ max_concurrent = max(1, max_concurrent)
249
+ _asset_semaphore = _asyncio.Semaphore(max_concurrent)
246
250
  logger.info(
247
- "Phase 2 starting: %d assets, pool_workers=%s",
251
+ "Phase 2 starting: %d assets, pool_workers=%s, max_concurrent_assets=%d",
248
252
  len(all_stubs),
249
253
  worker_pool.max_workers if worker_pool else "none",
254
+ max_concurrent,
250
255
  )
251
256
  error_count = 0
252
257
 
253
258
  async def _process_one(asset: Any) -> None:
259
+ async with _asset_semaphore:
260
+ await _process_one_inner(asset)
261
+
262
+ async def _process_one_inner(asset: Any) -> None:
254
263
  nonlocal processed_count, error_count
255
264
  asset_hash = getattr(asset, "hash", None) or ""
256
265
  try:
@@ -553,6 +562,12 @@ def main() -> None:
553
562
  default=None,
554
563
  help="Max OS processes in the detector pool. Auto-sized from CPU/memory when omitted (env: CLASSIFYRE_MAX_POOL_WORKERS)",
555
564
  )
565
+ parser.add_argument(
566
+ "--max-concurrent-assets",
567
+ type=int,
568
+ default=None,
569
+ help="Max assets processed concurrently in Phase 2. Controls DB connection usage. Defaults to pool_workers*2 (env: CLASSIFYRE_MAX_CONCURRENT_ASSETS)",
570
+ )
556
571
 
557
572
  args = parser.parse_args()
558
573
 
@@ -571,6 +586,13 @@ def main() -> None:
571
586
  except ValueError:
572
587
  args.max_pool_workers = None
573
588
 
589
+ if args.max_concurrent_assets is None:
590
+ env_val = os.environ.get("CLASSIFYRE_MAX_CONCURRENT_ASSETS")
591
+ try:
592
+ args.max_concurrent_assets = int(env_val) if env_val else None
593
+ except ValueError:
594
+ args.max_concurrent_assets = None
595
+
574
596
  if args.debug:
575
597
  logging.getLogger().setLevel(logging.DEBUG)
576
598
 
@@ -591,6 +613,15 @@ def main() -> None:
591
613
 
592
614
  recipe = load_recipe(args.recipe)
593
615
 
616
+ # Resolve resource overrides from recipe when CLI args / env vars are not set
617
+ recipe_resources = recipe.get("resources") or {}
618
+ if args.max_pool_workers is None and isinstance(recipe_resources.get("max_pool_workers"), int):
619
+ args.max_pool_workers = recipe_resources["max_pool_workers"]
620
+ if args.max_concurrent_assets is None and isinstance(
621
+ recipe_resources.get("max_concurrent_assets"), int
622
+ ):
623
+ args.max_concurrent_assets = recipe_resources["max_concurrent_assets"]
624
+
594
625
  source_type = recipe.get("type", "").lower()
595
626
  if not source_type:
596
627
  logger.error(
@@ -41,6 +41,7 @@ class AssetType(StrEnum):
41
41
  CONFLUENCE = 'CONFLUENCE'
42
42
  JIRA = 'JIRA'
43
43
  SERVICEDESK = 'SERVICEDESK'
44
+ SQLITE = 'SQLITE'
44
45
 
45
46
 
46
47
  class SourceCategory(StrEnum):
@@ -169,6 +170,12 @@ class ResourceOverrides(BaseModel):
169
170
  ge=1,
170
171
  le=16,
171
172
  )
173
+ max_concurrent_assets: int | None = Field(
174
+ None,
175
+ description='Max assets processed concurrently. Controls parallel DB connections. Defaults to pool_workers * 2 when omitted.',
176
+ ge=1,
177
+ le=50,
178
+ )
172
179
 
173
180
 
174
181
  class Detector(BaseModel):
@@ -1836,6 +1843,7 @@ class Type(StrEnum):
1836
1843
  CONFLUENCE = 'CONFLUENCE'
1837
1844
  JIRA = 'JIRA'
1838
1845
  SERVICEDESK = 'SERVICEDESK'
1846
+ SQLITE = 'SQLITE'
1839
1847
 
1840
1848
 
1841
1849
  class SlackInput(CoreInput):
@@ -2622,6 +2630,7 @@ class Type17(StrEnum):
2622
2630
  CONFLUENCE = 'CONFLUENCE'
2623
2631
  JIRA = 'JIRA'
2624
2632
  SERVICEDESK = 'SERVICEDESK'
2633
+ SQLITE = 'SQLITE'
2625
2634
 
2626
2635
 
2627
2636
  class ConfluenceInput(CoreInput):
@@ -2676,6 +2685,59 @@ class ServiceDeskInput(CoreInput):
2676
2685
  resources: ResourceOverrides | None = None
2677
2686
 
2678
2687
 
2688
+ class SQLiteRequired(BaseModel):
2689
+ model_config = ConfigDict(
2690
+ extra='forbid',
2691
+ )
2692
+ database_path: str = Field(
2693
+ ...,
2694
+ description='Absolute or relative path to the SQLite database file (e.g. /data/app.db)',
2695
+ )
2696
+
2697
+
2698
+ class SQLiteOptionalScope(BaseModel):
2699
+ """
2700
+ Table selection scope.
2701
+ """
2702
+
2703
+ model_config = ConfigDict(
2704
+ extra='forbid',
2705
+ )
2706
+ include_tables: list[str] | None = Field(
2707
+ None,
2708
+ description='Optional table allowlist. Only tables in this list will be scanned.',
2709
+ )
2710
+ table_limit: int | None = Field(
2711
+ None, description='Optional cap on number of table assets extracted', ge=1
2712
+ )
2713
+
2714
+
2715
+ class SQLiteOptional(BaseModel):
2716
+ model_config = ConfigDict(
2717
+ extra='forbid',
2718
+ )
2719
+ scope: SQLiteOptionalScope | None = None
2720
+
2721
+
2722
+ class SQLiteInput(CoreInput):
2723
+ type: Literal['SQLITE'] = Field('SQLITE', description='Type of the asset or source')
2724
+ required: SQLiteRequired
2725
+ masked: dict[str, Any] | None = Field(
2726
+ None,
2727
+ description='SQLite has no credentials; this section is intentionally empty.',
2728
+ )
2729
+ optional: SQLiteOptional | None = None
2730
+ detectors: list[Detector] | None = Field(
2731
+ None, description='Detectors to run on ingested content'
2732
+ )
2733
+ custom_detectors: list[CustomDetectorSelection] | None = Field(
2734
+ None,
2735
+ description='Reusable custom detector IDs selected from the custom detector catalog.',
2736
+ )
2737
+ sampling: SamplingConfig
2738
+ resources: ResourceOverrides | None = None
2739
+
2740
+
2679
2741
  class SourceInput(
2680
2742
  RootModel[
2681
2743
  SlackInput
@@ -2697,6 +2759,7 @@ class SourceInput(
2697
2759
  | ConfluenceInput
2698
2760
  | JiraInput
2699
2761
  | ServiceDeskInput
2762
+ | SQLiteInput
2700
2763
  ]
2701
2764
  ):
2702
2765
  root: (
@@ -2719,6 +2782,7 @@ class SourceInput(
2719
2782
  | ConfluenceInput
2720
2783
  | JiraInput
2721
2784
  | ServiceDeskInput
2785
+ | SQLiteInput
2722
2786
  ) = Field(
2723
2787
  ...,
2724
2788
  description='Merged configuration schema with all source types and common definitions',