classifyre-cli 0.4.3__tar.gz → 0.4.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/PKG-INFO +1 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/package.json +1 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/pyproject.toml +1 -1
- classifyre_cli-0.4.5/src/detectors/custom/extractor.py +261 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/main.py +32 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/models/generated_input.py +64 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/detector_pipeline.py +60 -35
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/worker_pool.py +17 -10
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/databricks/source.py +287 -672
- classifyre_cli-0.4.5/src/sources/hive/source.py +304 -0
- classifyre_cli-0.4.5/src/sources/mssql/source.py +621 -0
- classifyre_cli-0.4.5/src/sources/mysql/source.py +303 -0
- classifyre_cli-0.4.5/src/sources/oracle/source.py +632 -0
- classifyre_cli-0.4.5/src/sources/postgresql/source.py +214 -0
- classifyre_cli-0.4.5/src/sources/snowflake/source.py +624 -0
- classifyre_cli-0.4.5/src/sources/sqlite/source.py +212 -0
- classifyre_cli-0.4.5/src/sources/tabular_base.py +793 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/tabular_utils.py +36 -0
- classifyre_cli-0.4.5/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/pipeline/test_detector_pipeline.py +1 -4
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/pipeline/test_worker_pool.py +1 -0
- classifyre_cli-0.4.5/tests/test_custom_extractor.py +291 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_databricks_source.py +9 -9
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_hive_source.py +8 -8
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_mssql_source.py +5 -5
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_mysql_source.py +8 -8
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_oracle_source.py +27 -41
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_postgresql_source.py +3 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_snowflake_source.py +2 -2
- classifyre_cli-0.4.5/tests/test_sqlite_source.py +336 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/uv.lock +2 -2
- classifyre_cli-0.4.3/src/sources/hive/source.py +0 -709
- classifyre_cli-0.4.3/src/sources/mssql/source.py +0 -1034
- classifyre_cli-0.4.3/src/sources/mysql/source.py +0 -797
- classifyre_cli-0.4.3/src/sources/oracle/source.py +0 -982
- classifyre_cli-0.4.3/src/sources/postgresql/source.py +0 -774
- classifyre_cli-0.4.3/src/sources/snowflake/source.py +0 -912
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/.gitignore +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/.python-version +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/README.md +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/main.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/object_storage/base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.3/tests → classifyre_cli-0.4.5/src/sources/sqlite}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors → classifyre_cli-0.4.5/tests}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/content → classifyre_cli-0.4.5/tests/detectors}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/custom → classifyre_cli-0.4.5/tests/detectors/content}/__init__.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/pii → classifyre_cli-0.4.5/tests/detectors/custom}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/secrets → classifyre_cli-0.4.5/tests/detectors/pii}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/threat → classifyre_cli-0.4.5/tests/detectors/secrets}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.5}/tests/utils/test_file_parser.py +0 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""Custom detector extraction engine — REGEX, GLINER, and CLASSIFIER_GLINER strategies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ...models.generated_detectors import (
|
|
11
|
+
CustomDetectorMethod,
|
|
12
|
+
CustomExtractorConfig,
|
|
13
|
+
CustomExtractorField,
|
|
14
|
+
)
|
|
15
|
+
from ..dependencies import MissingDependencyError, require_module
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Checkpoint name passed to GLiNER2.from_pretrained when the extractor config
# does not set `gliner_model`.
_DEFAULT_GLINER2_MODEL = "fastino/gliner2-base-v1"

# Extraction method tags sent to the API via DetectionResult.extraction_method
EXTRACTION_METHOD_REGEX = "REGEX"
EXTRACTION_METHOD_GLINER = "GLINER"
EXTRACTION_METHOD_CLASSIFIER_GLINER = "CLASSIFIER_GLINER"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ExtractionResult:
    """Structured outcome of a single extraction pass.

    ``populated_fields`` and ``field_count`` are always recomputed from
    ``extracted_data`` after construction, so any values passed for them
    to the constructor are overwritten.
    """

    # Field name -> extracted value, as produced by the extractor.
    extracted_data: dict[str, Any]
    # Tag naming the strategy that produced the data (e.g. "REGEX").
    method: str
    # Names of the entries whose value is not None, [] or "".
    populated_fields: list[str] = field(default_factory=list)
    # Total number of entries in extracted_data, empty ones included.
    field_count: int = 0

    def __post_init__(self) -> None:
        non_empty: list[str] = []
        for name, value in self.extracted_data.items():
            # None, an empty list and an empty string all count as "not populated".
            if value is None or value in ([], ""):
                continue
            non_empty.append(name)
        self.populated_fields = non_empty
        self.field_count = len(self.extracted_data)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class CustomExtractor:
    """
    Runs after a custom detector fires to pull structured data from the content.

    Strategy selection:
      RULESET    → REGEX             (named capture groups in field.regex_pattern)
      ENTITY     → GLINER            (group GLiNER2 entity spans by entity_label into fields)
      CLASSIFIER → CLASSIFIER_GLINER (second GLiNER2 pass on wider content slice)
    """

    def __init__(
        self,
        config: CustomExtractorConfig,
        detector_method: CustomDetectorMethod,
    ) -> None:
        self._config = config
        self._method = detector_method
        # GLiNER2 model instance, loaded lazily on first use.
        self._gliner_model: Any | None = None
        # Pattern cache keyed by "<pattern>::<flags>". A value of None records a
        # pattern that failed to compile, so it is not retried or re-logged.
        self._compiled: dict[str, re.Pattern[str] | None] = {}

    # ── Public API ───────────────────────────────────────────────────────────

    def extract(
        self,
        matched_content: str,
        content_for_extraction: str,
    ) -> ExtractionResult | None:
        """
        Run extraction and return structured result, or None if nothing extracted.

        Args:
            matched_content: The content stored in the finding (may be truncated).
                Not read by any strategy in this method; kept for the interface.
            content_for_extraction: Wider slice of the original document for GLiNER/regex.

        Returns:
            An ExtractionResult, or None when the extractor is disabled, the
            detector method is unknown, or the selected strategy produced nothing.
        """
        if not self._config.enabled:
            return None

        if self._method == CustomDetectorMethod.RULESET:
            return self._extract_regex(content_for_extraction)
        if self._method == CustomDetectorMethod.ENTITY:
            return self._extract_gliner(content_for_extraction, EXTRACTION_METHOD_GLINER)
        if self._method == CustomDetectorMethod.CLASSIFIER:
            return self._extract_gliner(content_for_extraction, EXTRACTION_METHOD_CLASSIFIER_GLINER)
        logger.warning("CustomExtractor: unknown detector method %s", self._method)
        return None

    # ── RULESET — regex named groups ─────────────────────────────────────────

    def _extract_regex(self, content: str) -> ExtractionResult | None:
        """Apply every field that has a regex_pattern to ``content`` and finalize."""
        data: dict[str, Any] = {}

        for f in self._config.fields:
            if not f.regex_pattern:
                logger.debug(
                    "Extractor field '%s' has no regex_pattern — skipped for RULESET", f.name
                )
                continue
            value = self._apply_regex_field(content, f)
            if value is not None:
                data[f.name] = value

        return self._finalize(data, EXTRACTION_METHOD_REGEX)

    def _apply_regex_field(self, content: str, f: CustomExtractorField) -> Any:
        """Return the aggregated matches of one field's pattern, or None.

        Flags default to "i" when the field sets none. If the pattern has named
        groups, only the first one is captured; otherwise the whole match is used.
        Blank captures are dropped and the rest are stripped.
        """
        pattern = self._compile(f.regex_pattern or "", f.regex_flags or "i")
        if pattern is None:
            return None

        group_name = next(iter(pattern.groupindex), None)

        matches: list[str] = []
        for m in pattern.finditer(content):
            captured = m.group(group_name) if group_name else m.group(0)
            if captured and captured.strip():
                matches.append(captured.strip())

        return self._aggregate(matches, f) if matches else None

    def _compile(self, pattern: str, flags_str: str) -> re.Pattern[str] | None:
        """Compile ``pattern`` with the single-letter flags in ``flags_str``.

        Recognised flag letters are "i" (IGNORECASE), "m" (MULTILINE) and
        "s" (DOTALL); any other character is ignored. Results are cached per
        pattern/flags pair, including failures, so an invalid pattern is
        compiled and warned about only once.

        Returns:
            The compiled pattern, or None when the pattern is invalid.
        """
        cache_key = f"{pattern}::{flags_str}"
        if cache_key in self._compiled:
            return self._compiled[cache_key]

        flags = 0
        for ch in flags_str:
            if ch == "i":
                flags |= re.IGNORECASE
            elif ch == "m":
                flags |= re.MULTILINE
            elif ch == "s":
                flags |= re.DOTALL

        try:
            compiled = re.compile(pattern, flags=flags)
        except re.error as exc:
            logger.warning("CustomExtractor: invalid regex pattern '%s': %s", pattern, exc)
            # Remember the failure so the same bad pattern is not retried per call.
            self._compiled[cache_key] = None
            return None

        self._compiled[cache_key] = compiled
        return compiled

    # ── ENTITY / CLASSIFIER — GLiNER2 entity spans ───────────────────────────

    def _extract_gliner(self, content: str, method_tag: str) -> ExtractionResult | None:
        """Run GLiNER2 entity extraction and map entity spans onto config fields.

        Fields are grouped by ``entity_label``; the schema sent to the model maps
        each label to the first non-blank field description (or ""). The model is
        called with threshold 0.0 and each field then applies its own
        ``min_confidence`` (0.4 when unset).

        Returns:
            The finalized result tagged with ``method_tag``, or None when no field
            has an entity_label, the model is unavailable, the call fails, or the
            model output is not in the expected dict shape.
        """
        label_to_fields: dict[str, list[CustomExtractorField]] = {}
        for f in self._config.fields:
            if f.entity_label:
                label_to_fields.setdefault(f.entity_label, []).append(f)

        if not label_to_fields:
            logger.debug("CustomExtractor: no fields with entity_label — skipping GLiNER2")
            return None

        model = self._load_gliner()
        if model is None:
            return None

        entity_schema = {
            label: next(
                (
                    candidate.description
                    for candidate in label_fields
                    if isinstance(candidate.description, str) and candidate.description.strip()
                ),
                "",
            )
            for label, label_fields in label_to_fields.items()
        }
        try:
            result = model.extract_entities(
                content,
                entity_schema,
                threshold=0.0,
                include_confidence=True,
            )
        except Exception as exc:  # pragma: no cover
            logger.warning("CustomExtractor: GLiNER2 extraction failed: %s", exc)
            return None

        # Guard the shape before calling .get, so an unexpected return value
        # yields "nothing extracted" instead of an AttributeError.
        if not isinstance(result, dict):
            return None
        entities = result.get("entities", {})
        if not isinstance(entities, dict):
            return None

        data: dict[str, Any] = {}
        for entity_label, label_fields in label_to_fields.items():
            raw_spans = entities.get(entity_label, [])
            if not isinstance(raw_spans, list):
                raw_spans = [raw_spans]

            for f in label_fields:
                threshold = f.min_confidence if f.min_confidence is not None else 0.4
                values = self._filter_gliner2_values(raw_spans, threshold)
                value = self._aggregate(values, f) if values else None
                if value is not None:
                    data[f.name] = value

        return self._finalize(data, method_tag)

    def _filter_gliner2_values(self, raw_spans: list[Any], threshold: float) -> list[str]:
        """Return the stripped texts of spans whose score meets ``threshold``.

        Dict spans take their score from "confidence", falling back to "score".
        A missing, None or non-numeric score counts as 0.0, and a missing or None
        "text" counts as empty. Non-dict spans are treated as plain text with
        score 1.0. Spans with empty text are dropped.
        """
        values: list[str] = []
        for raw_span in raw_spans:
            if isinstance(raw_span, dict):
                raw_score = raw_span.get("confidence")
                if raw_score is None:
                    raw_score = raw_span.get("score")
                try:
                    score = float(raw_score) if raw_score is not None else 0.0
                except (TypeError, ValueError):
                    score = 0.0
                raw_text = raw_span.get("text")
                # str(None) would otherwise surface as the literal value "None".
                text = str(raw_text).strip() if raw_text is not None else ""
            else:
                score = 1.0
                text = str(raw_span).strip()

            if score >= threshold and text:
                values.append(text)

        return values

    def _load_gliner(self) -> Any | None:
        """Return the GLiNER2 model, loading it on first use.

        Returns:
            The cached or newly loaded model, or None when loading fails.

        Raises:
            MissingDependencyError: re-raised unchanged when the gliner2 module
                is not available.
        """
        if self._gliner_model is not None:
            return self._gliner_model
        try:
            gliner2_module = require_module("gliner2", "custom", ["classification", "detectors"])
            model_name = self._config.gliner_model or _DEFAULT_GLINER2_MODEL
            self._gliner_model = gliner2_module.GLiNER2.from_pretrained(model_name)
            return self._gliner_model
        except MissingDependencyError:
            raise
        except Exception as exc:  # pragma: no cover
            logger.warning("CustomExtractor: failed to load GLiNER2: %s", exc)
            return None

    # ── Shared helpers ────────────────────────────────────────────────────────

    def _aggregate(self, values: list[str], f: CustomExtractorField) -> Any:
        """Collapse ``values`` according to the field's ``aggregate`` mode.

        Modes: "first", "last", "list" (default), "join" (separator defaults to
        ", ") and "count". Unknown modes fall back to returning the list.
        Returns None for an empty ``values``.
        """
        if not values:
            return None
        aggregate = f.aggregate or "list"
        if aggregate == "first":
            return values[0]
        if aggregate == "last":
            return values[-1]
        if aggregate == "list":
            return values
        if aggregate == "join":
            sep = f.join_separator if f.join_separator is not None else ", "
            return sep.join(values)
        if aggregate == "count":
            return len(values)
        return values  # fallback

    def _finalize(self, data: dict[str, Any], method: str) -> ExtractionResult | None:
        """Wrap ``data`` in an ExtractionResult, or return None if it must be discarded."""
        # Required fields gate: if any required field is missing, discard the result
        for f in self._config.fields:
            if f.required and f.name not in data:
                logger.debug(
                    "CustomExtractor: required field '%s' not populated — discarding", f.name
                )
                return None

        if not data:
            return None

        return ExtractionResult(extracted_data=data, method=method)
|
|
@@ -243,14 +243,23 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
|
|
|
243
243
|
import asyncio as _asyncio
|
|
244
244
|
|
|
245
245
|
processed_count = 0
|
|
246
|
+
_pw = worker_pool.max_workers if worker_pool else 4
|
|
247
|
+
max_concurrent = args.max_concurrent_assets or (_pw * 2)
|
|
248
|
+
max_concurrent = max(1, max_concurrent)
|
|
249
|
+
_asset_semaphore = _asyncio.Semaphore(max_concurrent)
|
|
246
250
|
logger.info(
|
|
247
|
-
"Phase 2 starting: %d assets, pool_workers=%s",
|
|
251
|
+
"Phase 2 starting: %d assets, pool_workers=%s, max_concurrent_assets=%d",
|
|
248
252
|
len(all_stubs),
|
|
249
253
|
worker_pool.max_workers if worker_pool else "none",
|
|
254
|
+
max_concurrent,
|
|
250
255
|
)
|
|
251
256
|
error_count = 0
|
|
252
257
|
|
|
253
258
|
async def _process_one(asset: Any) -> None:
|
|
259
|
+
async with _asset_semaphore:
|
|
260
|
+
await _process_one_inner(asset)
|
|
261
|
+
|
|
262
|
+
async def _process_one_inner(asset: Any) -> None:
|
|
254
263
|
nonlocal processed_count, error_count
|
|
255
264
|
asset_hash = getattr(asset, "hash", None) or ""
|
|
256
265
|
try:
|
|
@@ -553,6 +562,12 @@ def main() -> None:
|
|
|
553
562
|
default=None,
|
|
554
563
|
help="Max OS processes in the detector pool. Auto-sized from CPU/memory when omitted (env: CLASSIFYRE_MAX_POOL_WORKERS)",
|
|
555
564
|
)
|
|
565
|
+
parser.add_argument(
|
|
566
|
+
"--max-concurrent-assets",
|
|
567
|
+
type=int,
|
|
568
|
+
default=None,
|
|
569
|
+
help="Max assets processed concurrently in Phase 2. Controls DB connection usage. Defaults to pool_workers*2 (env: CLASSIFYRE_MAX_CONCURRENT_ASSETS)",
|
|
570
|
+
)
|
|
556
571
|
|
|
557
572
|
args = parser.parse_args()
|
|
558
573
|
|
|
@@ -571,6 +586,13 @@ def main() -> None:
|
|
|
571
586
|
except ValueError:
|
|
572
587
|
args.max_pool_workers = None
|
|
573
588
|
|
|
589
|
+
if args.max_concurrent_assets is None:
|
|
590
|
+
env_val = os.environ.get("CLASSIFYRE_MAX_CONCURRENT_ASSETS")
|
|
591
|
+
try:
|
|
592
|
+
args.max_concurrent_assets = int(env_val) if env_val else None
|
|
593
|
+
except ValueError:
|
|
594
|
+
args.max_concurrent_assets = None
|
|
595
|
+
|
|
574
596
|
if args.debug:
|
|
575
597
|
logging.getLogger().setLevel(logging.DEBUG)
|
|
576
598
|
|
|
@@ -591,6 +613,15 @@ def main() -> None:
|
|
|
591
613
|
|
|
592
614
|
recipe = load_recipe(args.recipe)
|
|
593
615
|
|
|
616
|
+
# Resolve resource overrides from recipe when CLI args / env vars are not set
|
|
617
|
+
recipe_resources = recipe.get("resources") or {}
|
|
618
|
+
if args.max_pool_workers is None and isinstance(recipe_resources.get("max_pool_workers"), int):
|
|
619
|
+
args.max_pool_workers = recipe_resources["max_pool_workers"]
|
|
620
|
+
if args.max_concurrent_assets is None and isinstance(
|
|
621
|
+
recipe_resources.get("max_concurrent_assets"), int
|
|
622
|
+
):
|
|
623
|
+
args.max_concurrent_assets = recipe_resources["max_concurrent_assets"]
|
|
624
|
+
|
|
594
625
|
source_type = recipe.get("type", "").lower()
|
|
595
626
|
if not source_type:
|
|
596
627
|
logger.error(
|
|
@@ -41,6 +41,7 @@ class AssetType(StrEnum):
|
|
|
41
41
|
CONFLUENCE = 'CONFLUENCE'
|
|
42
42
|
JIRA = 'JIRA'
|
|
43
43
|
SERVICEDESK = 'SERVICEDESK'
|
|
44
|
+
SQLITE = 'SQLITE'
|
|
44
45
|
|
|
45
46
|
|
|
46
47
|
class SourceCategory(StrEnum):
|
|
@@ -169,6 +170,12 @@ class ResourceOverrides(BaseModel):
|
|
|
169
170
|
ge=1,
|
|
170
171
|
le=16,
|
|
171
172
|
)
|
|
173
|
+
max_concurrent_assets: int | None = Field(
|
|
174
|
+
None,
|
|
175
|
+
description='Max assets processed concurrently. Controls parallel DB connections. Defaults to pool_workers * 2 when omitted.',
|
|
176
|
+
ge=1,
|
|
177
|
+
le=50,
|
|
178
|
+
)
|
|
172
179
|
|
|
173
180
|
|
|
174
181
|
class Detector(BaseModel):
|
|
@@ -1836,6 +1843,7 @@ class Type(StrEnum):
|
|
|
1836
1843
|
CONFLUENCE = 'CONFLUENCE'
|
|
1837
1844
|
JIRA = 'JIRA'
|
|
1838
1845
|
SERVICEDESK = 'SERVICEDESK'
|
|
1846
|
+
SQLITE = 'SQLITE'
|
|
1839
1847
|
|
|
1840
1848
|
|
|
1841
1849
|
class SlackInput(CoreInput):
|
|
@@ -2622,6 +2630,7 @@ class Type17(StrEnum):
|
|
|
2622
2630
|
CONFLUENCE = 'CONFLUENCE'
|
|
2623
2631
|
JIRA = 'JIRA'
|
|
2624
2632
|
SERVICEDESK = 'SERVICEDESK'
|
|
2633
|
+
SQLITE = 'SQLITE'
|
|
2625
2634
|
|
|
2626
2635
|
|
|
2627
2636
|
class ConfluenceInput(CoreInput):
|
|
@@ -2676,6 +2685,59 @@ class ServiceDeskInput(CoreInput):
|
|
|
2676
2685
|
resources: ResourceOverrides | None = None
|
|
2677
2686
|
|
|
2678
2687
|
|
|
2688
|
+
# Required settings for a SQLite source. Documented with comments rather than a
# docstring so the model's generated schema description stays unchanged.
class SQLiteRequired(BaseModel):
    # Unknown keys are rejected rather than ignored.
    model_config = ConfigDict(extra='forbid')

    # Mandatory: location of the database file to scan.
    database_path: str = Field(
        default=...,
        description='Absolute or relative path to the SQLite database file (e.g. /data/app.db)',
    )
|
|
2696
|
+
|
|
2697
|
+
|
|
2698
|
+
class SQLiteOptionalScope(BaseModel):
    """
    Table selection scope.
    """

    # Unknown keys are rejected rather than ignored.
    model_config = ConfigDict(extra='forbid')

    # When set, restricts scanning to the named tables.
    include_tables: list[str] | None = Field(
        default=None,
        description='Optional table allowlist. Only tables in this list will be scanned.',
    )
    # When set, must be at least 1.
    table_limit: int | None = Field(
        default=None,
        description='Optional cap on number of table assets extracted',
        ge=1,
    )
|
|
2713
|
+
|
|
2714
|
+
|
|
2715
|
+
# Optional settings for a SQLite source; currently only the table scope.
class SQLiteOptional(BaseModel):
    # Unknown keys are rejected rather than ignored.
    model_config = ConfigDict(extra='forbid')

    scope: SQLiteOptionalScope | None = None
|
|
2720
|
+
|
|
2721
|
+
|
|
2722
|
+
# Source configuration for a SQLite database, discriminated by type == 'SQLITE'.
class SQLiteInput(CoreInput):
    type: Literal['SQLITE'] = Field(
        default='SQLITE',
        description='Type of the asset or source',
    )
    required: SQLiteRequired
    masked: dict[str, Any] | None = Field(
        default=None,
        description='SQLite has no credentials; this section is intentionally empty.',
    )
    optional: SQLiteOptional | None = None
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    # No default is given, so a sampling section must always be supplied.
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
|
|
2739
|
+
|
|
2740
|
+
|
|
2679
2741
|
class SourceInput(
|
|
2680
2742
|
RootModel[
|
|
2681
2743
|
SlackInput
|
|
@@ -2697,6 +2759,7 @@ class SourceInput(
|
|
|
2697
2759
|
| ConfluenceInput
|
|
2698
2760
|
| JiraInput
|
|
2699
2761
|
| ServiceDeskInput
|
|
2762
|
+
| SQLiteInput
|
|
2700
2763
|
]
|
|
2701
2764
|
):
|
|
2702
2765
|
root: (
|
|
@@ -2719,6 +2782,7 @@ class SourceInput(
|
|
|
2719
2782
|
| ConfluenceInput
|
|
2720
2783
|
| JiraInput
|
|
2721
2784
|
| ServiceDeskInput
|
|
2785
|
+
| SQLiteInput
|
|
2722
2786
|
) = Field(
|
|
2723
2787
|
...,
|
|
2724
2788
|
description='Merged configuration schema with all source types and common definitions',
|