classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Feature extraction (dense embeddings) pipeline runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ....models.generated_detectors import FeatureExtractionPipelineSchema, Severity
|
|
9
|
+
from ....models.generated_single_asset_scan_results import DetectionResult
|
|
10
|
+
from ...dependencies import ensure_torch, require_module
|
|
11
|
+
from ._base import _TEXT_CONTENT_TYPES, BaseRunner
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _chunk_text_with_offsets(
    text: str, chunk_size: int | None, chunk_overlap: int
) -> list[tuple[str, int]]:
    """Split *text* into fixed-width character windows and record where each starts.

    Args:
        text: The string to split.
        chunk_size: Window width in characters. ``None``, ``0`` or a negative
            value disables chunking.
        chunk_overlap: Number of characters shared by consecutive windows.
            Negative values are treated as ``0``; values at or above
            ``chunk_size`` degrade to a one-character stride.

    Returns:
        ``(chunk, char_offset)`` pairs in document order. When chunking is
        disabled the result is ``[(text, 0)]``. With chunking enabled an empty
        *text* yields ``[]``.
    """
    if not chunk_size or chunk_size < 0:
        return [(text, 0)]

    # Never let the stride drop below one character, otherwise the loop
    # would not advance.
    step = max(1, chunk_size - max(0, chunk_overlap))
    total = len(text)

    chunks: list[tuple[str, int]] = []
    for start in range(0, total, step):
        chunks.append((text[start : start + chunk_size], start))
        if start + chunk_size >= total:
            # This window already reaches the end of the text. Any further
            # window would be a fragment fully contained in this one.
            break
    return chunks
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _pool_hidden(
    hidden: list[list[float]], pooling: str, normalize: bool
) -> list[float] | list[list[float]] | None:
    """Apply a pooling strategy to per-token hidden states.

    Args:
        hidden: One vector per token (a list of equally sized float lists).
        pooling: ``"cls"`` keeps the first token vector, ``"max"`` takes the
            element-wise maximum, ``"none"`` returns every token vector
            unchanged; any other value falls back to the element-wise mean.
        normalize: When true, scale the pooled vector to unit L2 norm. A zero
            vector is returned as-is. Ignored for ``"none"`` pooling.

    Returns:
        The pooled vector, the full list of token vectors for ``"none"``, or
        ``None`` when numpy is unavailable or *hidden* is empty.
    """
    try:
        import numpy as np  # type: ignore[import-untyped]
    except ImportError:
        logger.warning("numpy is required for feature extraction pooling")
        return None

    # Nothing to pool: indexing, max and mean are all ill-defined on an
    # empty array, so report "no embedding" instead.
    if len(hidden) == 0:
        return None

    arr = np.array(hidden, dtype=np.float32)

    if pooling == "none":
        return arr.tolist()  # type: ignore[no-any-return]

    if pooling == "cls":
        vector = arr[0]
    elif pooling == "max":
        vector = arr.max(axis=0)
    else:
        vector = arr.mean(axis=0)

    if normalize:
        norm = float(np.linalg.norm(vector))
        if norm > 0:  # avoid dividing a zero vector by zero
            vector = vector / norm
    return vector.tolist()  # type: ignore[no-any-return]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class FeatureExtractionRunner(BaseRunner):
    """Dense vector embeddings via a single HuggingFace feature-extraction pipeline.

    The pipeline is built once in the constructor. :meth:`detect` splits the
    text into chunks, embeds each chunk and emits one ``embedding`` result per
    chunk.
    """

    def __init__(
        self,
        schema: FeatureExtractionPipelineSchema,
        detector_key: str = "",
        detector_name: str = "",
    ) -> None:
        """Build the feature-extraction pipeline described by *schema*.

        Args:
            schema: Pipeline configuration (model, device, truncation,
                max_length, model_revision, pooling and chunking options).
            detector_key: Identifier of the owning detector.
            detector_name: Display name of the owning detector.
        """
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name
        ensure_torch("feature_extraction", ["custom", "detectors"])
        transformers = require_module("transformers", "feature_extraction", ["custom", "detectors"])

        # Truncation defaults to on when the schema leaves it unset.
        truncation = schema.truncation if schema.truncation is not None else True
        tokenize_kwargs: dict[str, Any] = {"truncation": truncation}
        if schema.max_length is not None:
            tokenize_kwargs["max_length"] = schema.max_length

        pipeline_kwargs: dict[str, Any] = {
            "model": schema.model,
            "device": schema.device or "cpu",
            # The feature-extraction pipeline takes tokenizer options under
            # the name ``tokenize_kwargs``. A differently named key is
            # swallowed silently and truncation/max_length never apply.
            "tokenize_kwargs": tokenize_kwargs,
        }
        if schema.model_revision:
            pipeline_kwargs["revision"] = schema.model_revision
        self._pipe: Any = transformers.pipeline("feature-extraction", **pipeline_kwargs)

    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
        """Unsupported entry point; this runner is driven through :meth:`detect`."""
        raise NotImplementedError("FeatureExtractionRunner uses detect() directly")

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Embed *content* chunk by chunk and return one result per chunk.

        Args:
            content: Text to embed. ``bytes`` input is ignored.
            content_type: Content type of the asset; anything outside
                ``_TEXT_CONTENT_TYPES`` is ignored.

        Returns:
            One result per embedded chunk, with the vector and chunk position
            stored in the result metadata. Empty when the input is not
            embeddable. If the pipeline raises part-way through, the error is
            logged and the results gathered so far are returned.
        """
        if isinstance(content, bytes):
            return []
        if content_type not in _TEXT_CONTENT_TYPES:
            return []
        # NOTE(review): chunk offsets below are relative to this stripped
        # text, not to the original ``content``.
        text = content.strip()
        if not text:
            return []

        schema = self._schema
        strategy = schema.pooling_strategy or "mean"
        # Accept both plain strings and enum members: use the member's value
        # when there is one so the comparisons in _pool_hidden see "cls",
        # "max", "none" or "mean".
        pooling = str(getattr(strategy, "value", strategy))
        normalize = schema.normalize_embeddings if schema.normalize_embeddings is not None else True
        # The chunking options may be wrapped in a model exposing ``.root``.
        chunk_size: int | None = getattr(schema.chunk_size, "root", schema.chunk_size)
        chunk_overlap: int = getattr(schema.chunk_overlap, "root", schema.chunk_overlap) or 0

        results: list[DetectionResult] = []
        try:
            for chunk, offset in _chunk_text_with_offsets(text, chunk_size, chunk_overlap):
                raw: list[list[list[float]]] = self._pipe(chunk) or []
                if not raw or not raw[0]:
                    continue
                embedding = _pool_hidden(raw[0], pooling, normalize)
                if embedding is None:
                    continue
                dim: int | None
                if pooling == "none":
                    # Per-token output: the dimension is the width of one token vector.
                    dim = (
                        len(embedding[0]) if embedding and isinstance(embedding[0], list) else None
                    )  # type: ignore[index]
                else:
                    dim = len(embedding)  # type: ignore[arg-type]
                results.append(
                    self._make_result(
                        finding_type="embedding",
                        category="CLASSIFICATION",
                        severity=Severity.info,
                        confidence=1.0,
                        matched_content=chunk[:256],
                        location=None,
                        metadata={
                            "embedding": embedding,
                            "dimension": dim,
                            "pooling_strategy": pooling,
                            "normalized": normalize,
                            "model": schema.model,
                            "chunk_offset": offset,
                            "chunk_length": len(chunk),
                        },
                    )
                )
        except Exception as exc:
            logger.error(
                "feature_extraction error (model=%s): %s", schema.model, exc, exc_info=True
            )
        return results

    def get_supported_content_types(self) -> list[str]:
        """Return the text content types this runner accepts."""
        return list(_TEXT_CONTENT_TYPES)
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
"""GLiNER2 pipeline runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import re
|
|
8
|
+
import time
|
|
9
|
+
from datetime import UTC, datetime
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from ....models.generated_detectors import (
|
|
14
|
+
GLiNER2PipelineSchema,
|
|
15
|
+
PipelineEntityDefinition,
|
|
16
|
+
PipelineResult,
|
|
17
|
+
PipelineValidationConfig,
|
|
18
|
+
)
|
|
19
|
+
from ...dependencies import MissingDependencyError, require_module
|
|
20
|
+
from ._base import _DEFAULT_GLINER2_MODEL, BaseRunner
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class GLiNER2Runner(BaseRunner):
    """Execute a GLiNER2 pipeline: single-model pass for entities + classification.

    When the pipeline schema's model.path points to a trained artifact directory
    (written by trainer.py), the runner:
      - loads the fine-tuned GLiNER2 weights from <path>/gliner2/ if present
      - uses per-task SetFit models from <path>/setfit/<task>/ for classification

    Models are loaded on first use and cached on the instance.
    """

    def __init__(
        self, schema: GLiNER2PipelineSchema, detector_key: str = "", detector_name: str = ""
    ) -> None:
        """Store the configuration and detect an artifact directory.

        No model is loaded here; that happens on the first :meth:`run` call.
        """
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name
        self._model: Any | None = None
        # task name -> loaded SetFit model, or None when no usable model exists
        self._setfit_models: dict[str, Any] | None = None
        # task name -> label list read from that task's labels.json
        self._setfit_labels: dict[str, list[str]] = {}
        self._artifact_dir: Path | None = None
        self._init_artifact_dir()

    def _init_artifact_dir(self) -> None:
        """Record model.path as the artifact directory if it contains manifest.json."""
        model_cfg = self._schema.model
        if not model_cfg or not model_cfg.path:
            return
        candidate = Path(model_cfg.path)
        if candidate.is_dir() and (candidate / "manifest.json").exists():
            self._artifact_dir = candidate
            logger.info(
                "Artifact directory detected for detector '%s': %s",
                self._detector_key,
                candidate,
            )

    def run(self, text: str) -> PipelineResult:
        """Extract entities and classify *text*, then apply validation.

        Returns:
            A ``PipelineResult`` with the validated entities, the validated
            classification outcomes and run metadata (model, runner tag,
            latency, timestamp). An empty ``PipelineResult`` is returned when
            the model cannot be loaded or inference raises.
        """
        started = time.monotonic()  # seconds; converted to milliseconds below
        model = self._load_model()
        if model is None:
            return PipelineResult()

        entity_schema = self._build_entity_schema()
        classification_tasks = self._build_classification_tasks()

        raw_entities: dict[str, list[dict[str, object]]] = {}
        raw_classification: dict[str, dict[str, object]] = {}

        try:
            if entity_schema:
                # Threshold 0.0 keeps every candidate; confidence filtering
                # happens in the validation step below.
                raw = model.extract_entities(
                    text,
                    entity_schema,
                    threshold=0.0,
                    include_confidence=True,
                    include_spans=True,
                )
                raw_entities = _normalise_entity_output(raw, text)

            for task_name, labels in classification_tasks.items():
                # Prefer a trained per-task SetFit model; otherwise use GLiNER2 itself.
                setfit = self._get_setfit_model(task_name)
                if setfit is not None:
                    raw_cls = self._run_setfit(setfit, task_name, text)
                else:
                    raw_cls = model.classify(text, labels, threshold=0.0)
                raw_classification[task_name] = _normalise_classification_output(raw_cls)

        except Exception as exc:  # pragma: no cover - runtime specific
            # Include the traceback, as the other runners in this package do.
            logger.error(
                "GLiNER2 pipeline failed for detector '%s': %s",
                self._detector_key,
                exc,
                exc_info=True,
            )
            return PipelineResult()

        validation = self._schema.validation or PipelineValidationConfig()
        filtered_entities = _apply_entity_validation(raw_entities, validation, self._schema)
        filtered_classification = _apply_classification_validation(raw_classification, validation)

        latency_ms = round((time.monotonic() - started) * 1000)
        model_cfg = self._schema.model
        model_name = (
            model_cfg.name if model_cfg else _DEFAULT_GLINER2_MODEL
        ) or _DEFAULT_GLINER2_MODEL
        runner_tag = "GLINER2+ARTIFACT" if self._artifact_dir else "GLINER2"

        return PipelineResult(
            entities=filtered_entities,
            classification=filtered_classification,
            metadata={
                "model": model_name,
                "runner": runner_tag,
                "latency_ms": latency_ms,
                "timestamp": datetime.now(UTC).isoformat(),
            },
        )

    def _load_model(self) -> Any | None:
        """Return the cached GLiNER2 model, loading it on first use.

        Resolution order: fine-tuned weights in ``<artifact>/gliner2``, then
        ``model.path`` (only when it is not an artifact directory), then
        ``model.name``, then the package default.

        Returns:
            The model, or ``None`` when loading failed.

        Raises:
            MissingDependencyError: When the ``gliner2`` package is missing
                on the regular (non fine-tuned) load path.
        """
        if self._model is not None:
            return self._model

        if self._artifact_dir is not None:
            gliner_path = self._artifact_dir / "gliner2"
            if gliner_path.is_dir():
                model_name = str(gliner_path)
                logger.info(
                    "Loading fine-tuned GLiNER2 from '%s' for detector '%s'",
                    model_name,
                    self._detector_key,
                )
                try:
                    gliner2_module = require_module(
                        "gliner2", "custom", ["classification", "detectors"]
                    )
                    self._model = gliner2_module.GLiNER2.from_pretrained(model_name)
                    return self._model
                except Exception as exc:
                    # Best effort: fall through to the regular model below.
                    logger.warning("Failed to load fine-tuned GLiNER2, falling back: %s", exc)

        model_cfg = self._schema.model
        if model_cfg and model_cfg.path and not self._artifact_dir:
            model_name = model_cfg.path
        elif model_cfg and model_cfg.name:
            model_name = model_cfg.name
        else:
            model_name = _DEFAULT_GLINER2_MODEL

        try:
            gliner2_module = require_module("gliner2", "custom", ["classification", "detectors"])
            self._model = gliner2_module.GLiNER2.from_pretrained(model_name)
            logger.info(
                "GLiNER2 model '%s' loaded for detector '%s'", model_name, self._detector_key
            )
            return self._model
        except MissingDependencyError:
            raise
        except Exception as exc:  # pragma: no cover - environment specific
            logger.warning(
                "Failed to load GLiNER2 model '%s' for detector '%s': %s",
                model_name,
                self._detector_key,
                exc,
            )
            return None

    def _get_setfit_model(self, task_name: str) -> Any | None:
        """Return the SetFit model for task_name if a trained artifact exists.

        The outcome (a model or ``None``) is cached per task, so a missing or
        broken artifact is only probed once.
        """
        if self._artifact_dir is None:
            return None

        if self._setfit_models is None:
            self._setfit_models = {}

        if task_name in self._setfit_models:
            return self._setfit_models[task_name]

        model_path = self._artifact_dir / "setfit" / task_name
        labels_path = model_path / "labels.json"
        if not model_path.is_dir() or not labels_path.exists():
            self._setfit_models[task_name] = None
            return None

        try:
            from setfit import SetFitModel  # type: ignore[import-untyped]

            sfm = SetFitModel.from_pretrained(str(model_path))
            self._setfit_models[task_name] = sfm
            # Read with an explicit encoding so the result does not depend on
            # the platform's locale.
            self._setfit_labels[task_name] = json.loads(labels_path.read_text(encoding="utf-8"))
            logger.info("SetFit model for task '%s' loaded from '%s'", task_name, model_path)
            return sfm
        except Exception as exc:
            logger.warning("Failed to load SetFit model for task '%s': %s", task_name, exc)
            self._setfit_models[task_name] = None
            return None

    def _run_setfit(self, model: Any, task_name: str, text: str) -> dict[str, object]:
        """Run a SetFit model and return a label/confidence dict.

        Returns:
            ``{"label": ..., "confidence": ...}`` for the highest-probability
            class, or ``{}`` when inference fails. When no label name is known
            for the winning index, the index itself (as a string) is used.
        """
        try:
            import torch  # type: ignore[import-untyped]

            labels = self._setfit_labels.get(task_name, [])
            with torch.no_grad():
                probs = model.predict_proba([text])
            prob_row = probs[0].tolist() if hasattr(probs[0], "tolist") else list(probs[0])
            # An empty row makes max() raise, which the handler below turns into {}.
            best_idx = max(range(len(prob_row)), key=prob_row.__getitem__)
            best_label = labels[best_idx] if best_idx < len(labels) else str(best_idx)
            best_conf = float(prob_row[best_idx])
            return {"label": best_label, "confidence": round(best_conf, 4)}
        except Exception as exc:
            logger.warning("SetFit inference failed for task '%s': %s", task_name, exc)
            return {}

    def _build_entity_schema(self) -> dict[str, str]:
        """Map each configured entity label to its description text."""
        entities = self._schema.entities or {}
        return {
            label: defn.description if isinstance(defn, PipelineEntityDefinition) else str(defn)
            for label, defn in entities.items()
        }

    def _build_classification_tasks(self) -> dict[str, list[str]]:
        """Map each configured classification task to its candidate labels."""
        return {task: defn.labels for task, defn in (self._schema.classification or {}).items()}
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# ── Normalisation helpers (used only by GLiNER2Runner) ────────────────────────
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _normalise_entity_output(raw: dict[str, Any], text: str) -> dict[str, list[dict[str, object]]]:
    """Convert raw entity extraction output into ``label -> [span dict, ...]``.

    The per-label mapping is read from ``raw["entities"]`` when that key
    exists, otherwise *raw* itself is used. Each span goes through
    ``_normalise_span``; spans it rejects are dropped, and labels left with
    no spans are omitted. A non-dict entity mapping yields ``{}``.
    """
    by_label = raw.get("entities", raw)
    if not isinstance(by_label, dict):
        return {}

    cleaned: dict[str, list[dict[str, object]]] = {}
    for label, spans in by_label.items():
        # A single span may be given bare instead of wrapped in a list.
        candidates: list[Any] = spans if isinstance(spans, list) else [spans]
        kept = []
        for candidate in candidates:
            span = _normalise_span(candidate, text)
            if span:
                kept.append(span)
        if kept:
            cleaned[label] = kept
    return cleaned
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _normalise_span(span: Any, text: str) -> dict[str, object] | None:
    """Normalise one raw entity span to ``value``/``confidence``/``start``/``end``.

    Args:
        span: Either a dict with optional ``text``, ``confidence`` (or
            ``score``), ``start`` and ``end`` keys, or any other object whose
            string form is taken as the entity text with confidence ``1.0``.
        text: The source text, used to recover a missing value from the
            offsets or missing offsets from the value.

    Returns:
        The normalised span, or ``None`` when no non-empty value can be
        determined or the span cannot be located in *text*.
    """
    if isinstance(span, dict):
        raw_value = span.get("text")
        # A null text must count as missing; str(None) would otherwise
        # produce the literal entity value "None".
        value = "" if raw_value is None else str(raw_value).strip()
        # Prefer "confidence", then "score". A null is treated like an
        # absent key instead of making float() raise.
        score = span.get("confidence")
        if score is None:
            score = span.get("score")
        confidence = 0.0 if score is None else float(score)
        start = span.get("start")
        end = span.get("end")
    else:
        value = str(span).strip()
        confidence = 1.0
        start = None
        end = None

    have_offsets = isinstance(start, int) and isinstance(end, int)

    # No text supplied: recover it from the offsets when both are present.
    if not value and have_offsets:
        value = text[start:end].strip()
    if not value:
        return None

    # No usable offsets: locate the first occurrence of the value instead.
    if not have_offsets:
        start = text.find(value)
        end = start + len(value) if start >= 0 else -1

    if start < 0:
        return None

    return {"value": value, "confidence": round(confidence, 4), "start": start, "end": end}
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _normalise_classification_output(raw: Any) -> dict[str, object]:
    """Reduce raw classifier output to a single ``{"label", "confidence"}`` dict.

    Accepted shapes:
      - a dict with ``label`` and ``confidence`` (or ``score``);
      - a non-empty list/tuple of candidates, from which the highest-scoring
        dict is chosen. A non-dict winner is stringified and given
        confidence ``1.0``.

    Anything else (including an empty sequence) yields ``{}``.
    """

    def _candidate_score(item: Any) -> Any:
        """Score of one list candidate; non-dict candidates rank as 0.0."""
        if not isinstance(item, dict):
            return 0.0
        # "score" stays the primary key for list items. "confidence" is
        # honoured as a fallback so both branches accept the same two names.
        return item.get("score", item.get("confidence", 0.0))

    if isinstance(raw, dict):
        label = raw.get("label", "")
        confidence = float(raw.get("confidence", raw.get("score", 0.0)))
    elif isinstance(raw, (list, tuple)) and raw:
        best = max(raw, key=_candidate_score)
        if isinstance(best, dict):
            label = best.get("label", "")
            confidence = float(_candidate_score(best))
        else:
            label = str(best)
            confidence = 1.0
    else:
        return {}

    return {"label": label, "confidence": round(confidence, 4)}
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _apply_entity_validation(
    entities: dict[str, list[dict[str, object]]],
    validation: PipelineValidationConfig,
    schema: GLiNER2PipelineSchema,
) -> dict[str, list[dict[str, object]]]:
    """Filter normalised entity spans by confidence, regex rules and required labels.

    Args:
        entities: ``label -> spans`` as produced by the normalisation step.
        validation: Supplies ``confidence_threshold`` (0.7 when unset) and
            optional ``rules``. Only rules of type ``"regex"`` whose ``field``
            equals the entity label are applied.
        schema: Pipeline schema. Entity definitions flagged ``required`` must
            survive filtering.

    Returns:
        The surviving ``label -> spans`` mapping, or ``{}`` when any required
        entity has no surviving span.
    """
    configured = validation.confidence_threshold
    # Only an unset threshold falls back to the default. An explicit 0.0 is
    # a legitimate "accept everything" setting and must not become 0.7.
    threshold = 0.7 if configured is None else configured
    result: dict[str, list[dict[str, object]]] = {}

    for label, spans in entities.items():
        passing = [
            span
            for span in spans
            if isinstance(span.get("confidence"), (int, float))
            and float(span["confidence"]) >= threshold  # type: ignore[arg-type]
        ]
        for rule in validation.rules or []:
            if rule.field != label or rule.type != "regex" or not rule.pattern:
                continue
            try:
                rx = re.compile(rule.pattern)
            except re.error as exc:
                # A broken pattern is skipped, not treated as "no span matches".
                logger.warning("Invalid validation regex for field '%s': %s", label, exc)
                continue
            passing = [s for s in passing if rx.search(str(s.get("value", "")))]
        if passing:
            result[label] = passing

    for label, defn in (schema.entities or {}).items():
        if isinstance(defn, PipelineEntityDefinition) and defn.required and label not in result:
            logger.debug("Required entity '%s' not found — suppressing all findings", label)
            return {}

    return result
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _apply_classification_validation(
    classification: dict[str, dict[str, object]],
    validation: PipelineValidationConfig,
) -> dict[str, dict[str, object]]:
    """Keep only classification outcomes whose confidence meets the threshold.

    Args:
        classification: ``task -> {"label", "confidence"}`` outcomes.
        validation: Supplies ``confidence_threshold`` (0.7 when unset).

    Returns:
        The outcomes with a numeric ``confidence`` at or above the threshold.
        Outcomes without a numeric confidence (including empty dicts) are
        dropped.
    """
    configured = validation.confidence_threshold
    # Only an unset threshold falls back to the default; an explicit 0.0 is honoured.
    threshold = 0.7 if configured is None else configured

    kept: dict[str, dict[str, object]] = {}
    for task, outcome in classification.items():
        # Read the key once. A missing confidence must not pass the type
        # check on a default value and then fail on a direct lookup.
        confidence = outcome.get("confidence")
        if isinstance(confidence, (int, float)) and float(confidence) >= threshold:
            kept[task] = outcome
    return kept
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Image classification pipeline runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ....models.generated_detectors import ImageClassificationPipelineSchema
|
|
10
|
+
from ....models.generated_single_asset_scan_results import DetectionResult
|
|
11
|
+
from ...dependencies import ensure_torch, require_module
|
|
12
|
+
from ._base import (
|
|
13
|
+
_DEFAULT_IMAGE_CLASSIFICATION_MODEL,
|
|
14
|
+
_IMAGE_CONTENT_TYPES,
|
|
15
|
+
BaseRunner,
|
|
16
|
+
_resolve_pipeline_severity,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ImageClassificationRunner(BaseRunner):
    """Image classification via a single HuggingFace image-classification pipeline.

    The pipeline is built once in the constructor. :meth:`detect` decodes the
    image bytes and emits one result per prediction that meets the configured
    confidence threshold.
    """

    def __init__(
        self,
        schema: ImageClassificationPipelineSchema,
        detector_key: str = "",
        detector_name: str = "",
    ) -> None:
        """Build the image-classification pipeline described by *schema*.

        Args:
            schema: Pipeline configuration (model, device, model_revision,
                top_k, function_to_apply, confidence_threshold, severity_map).
            detector_key: Identifier of the owning detector.
            detector_name: Display name of the owning detector.
        """
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name
        ensure_torch("image_classification", ["custom", "detectors"])
        transformers = require_module(
            "transformers", "image_classification", ["custom", "detectors"]
        )
        self._pil = require_module("PIL.Image", "image_classification", ["custom", "detectors"])

        model_id = schema.model or _DEFAULT_IMAGE_CLASSIFICATION_MODEL
        pipeline_kwargs: dict[str, Any] = {
            "model": model_id,
            "device": schema.device or "cpu",
        }
        if schema.model_revision:
            pipeline_kwargs["revision"] = schema.model_revision
        if schema.top_k is not None:
            pipeline_kwargs["top_k"] = schema.top_k
        if schema.function_to_apply is not None:
            # Accept both plain strings and enum members: use the member's
            # value when there is one, because str() of a plain Enum member
            # is "Class.member", not the bare value.
            function_to_apply = schema.function_to_apply
            pipeline_kwargs["function_to_apply"] = str(
                getattr(function_to_apply, "value", function_to_apply)
            )
        self._pipe: Any = transformers.pipeline("image-classification", **pipeline_kwargs)
        self._model_id = model_id

    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
        """Unsupported entry point; this runner is driven through :meth:`detect`."""
        raise NotImplementedError("ImageClassificationRunner uses detect() directly")

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Classify an image and return one result per accepted prediction.

        Args:
            content: Encoded image bytes. String input is rejected with a warning.
            content_type: Content type of the asset; must start with ``image/``.

        Returns:
            Results sorted by descending confidence. If decoding or inference
            raises, the error is logged and whatever was collected so far is
            returned.
        """
        if not content_type.startswith("image/"):
            return []
        if isinstance(content, str):
            logger.warning("image_classification: received string content, expected bytes")
            return []

        schema = self._schema
        # No threshold configured means every prediction is kept.
        threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.0
        results: list[DetectionResult] = []
        try:
            image = self._pil.open(io.BytesIO(content))
            predictions: list[dict[str, Any]] = self._pipe(image) or []
            # Same for every prediction of this image, so computed once.
            image_size = f"{image.size[0]}x{image.size[1]}"
            for pred in predictions:
                label: str = pred.get("label", "unknown")
                score: float = float(pred.get("score", 0.0))
                if score < threshold:
                    continue
                severity = _resolve_pipeline_severity(label, schema.severity_map)
                results.append(
                    self._make_result(
                        finding_type=f"classification:{label}",
                        category="CONTENT",
                        severity=severity,
                        confidence=score,
                        matched_content=f"Image classified as: {label} ({score:.3f})",
                        location=None,
                        metadata={
                            "image_size": image_size,
                            "image_mode": image.mode,
                            "model": self._model_id,
                        },
                    )
                )
        except Exception as exc:
            logger.error(
                "image_classification error (model=%s): %s", self._model_id, exc, exc_info=True
            )
        results.sort(key=lambda r: r.confidence, reverse=True)
        return results

    def get_supported_content_types(self) -> list[str]:
        """Return the image content types this runner accepts."""
        return list(_IMAGE_CONTENT_TYPES)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""LLM pipeline runner (stub — not yet implemented)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ....models.generated_detectors import LLMPipelineSchema, PipelineResult
|
|
6
|
+
from ._base import BaseRunner
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LLMRunner(BaseRunner):
    """Placeholder runner for LLM-based detection.

    The constructor only records its arguments; :meth:`run` always raises
    ``NotImplementedError``.
    """

    def __init__(
        self,
        schema: LLMPipelineSchema,
        detector_key: str = "",
        detector_name: str = "",
    ) -> None:
        """Remember the pipeline schema and the owning detector's identifiers."""
        self._detector_name = detector_name
        self._detector_key = detector_key
        self._schema = schema

    def run(self, text: str) -> PipelineResult:  # pragma: no cover - stub
        """Always raise ``NotImplementedError``, naming the owning detector."""
        message = f"LLM runner is not yet implemented (detector '{self._detector_key}')"
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Object detection pipeline runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ....models.generated_detectors import ObjectDetectionPipelineSchema
|
|
10
|
+
from ....models.generated_single_asset_scan_results import DetectionResult, Location
|
|
11
|
+
from ...dependencies import MissingDependencyError, ensure_torch, require_module
|
|
12
|
+
from ._base import _IMAGE_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ObjectDetectionRunner(BaseRunner):
    """Object detection via a single HuggingFace object-detection pipeline.

    The pipeline is built once in the constructor. :meth:`detect` decodes the
    image bytes and emits one result per detection that passes the confidence
    and minimum-box-area filters.
    """

    def __init__(
        self,
        schema: ObjectDetectionPipelineSchema,
        detector_key: str = "",
        detector_name: str = "",
    ) -> None:
        """Build the object-detection pipeline described by *schema*.

        Args:
            schema: Pipeline configuration (model, device, model_revision,
                nms_threshold, confidence_threshold, min_box_area, top_k,
                severity_map).
            detector_key: Identifier of the owning detector.
            detector_name: Display name of the owning detector.

        Raises:
            MissingDependencyError: When building the pipeline fails with an
                ``ImportError``.
        """
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name
        ensure_torch("object_detection", ["custom", "detectors"])
        transformers = require_module("transformers", "object_detection", ["custom", "detectors"])
        self._pil = require_module("PIL.Image", "object_detection", ["custom", "detectors"])

        pipeline_kwargs: dict[str, Any] = {
            "model": schema.model,
            "device": schema.device or "cpu",
        }
        if schema.model_revision:
            pipeline_kwargs["revision"] = schema.model_revision
        # The option may be wrapped in a model exposing ``.root``.
        nms = getattr(schema.nms_threshold, "root", schema.nms_threshold)
        if nms is not None:
            # NOTE(review): the schema's nms_threshold is forwarded as the
            # pipeline's ``threshold`` argument. Confirm this is the intended
            # mapping; confidence_threshold is applied separately in detect().
            pipeline_kwargs["threshold"] = nms
        try:
            self._pipe: Any = transformers.pipeline("object-detection", **pipeline_kwargs)
        except ImportError as exc:
            raise MissingDependencyError(
                "object_detection",
                ["custom", "detectors"],
                f"ObjectDetectionRunner requires additional dependencies: {exc}",
            ) from exc

    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
        """Unsupported entry point; this runner is driven through :meth:`detect`."""
        raise NotImplementedError("ObjectDetectionRunner uses detect() directly")

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Detect objects in an image and return one result per accepted detection.

        Args:
            content: Encoded image bytes. String input is rejected with a warning.
            content_type: Content type of the asset; must start with ``image/``.

        Returns:
            Results sorted by descending confidence and capped at
            ``schema.top_k`` when that is set. If decoding or inference
            raises, the error is logged and whatever was collected so far is
            returned, still sorted and capped.
        """
        if not content_type.startswith("image/"):
            return []
        if isinstance(content, str):
            logger.warning("object_detection: received string content, expected bytes")
            return []

        schema = self._schema
        threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
        results: list[DetectionResult] = []
        try:
            image = self._pil.open(io.BytesIO(content))
            detections: list[dict[str, Any]] = self._pipe(image) or []
            # Same for every detection of this image, so computed once.
            image_size = f"{image.size[0]}x{image.size[1]}"
            for det in detections:
                label: str = det.get("label", "unknown")
                score: float = float(det.get("score", 0.0))
                # Tolerate a missing or null box.
                box: dict[str, int] = det.get("box") or {}
                if score < threshold:
                    continue
                if schema.min_box_area is not None:
                    # Clamp to zero so an inverted box cannot yield a positive area.
                    w = max(0, box.get("xmax", 0) - box.get("xmin", 0))
                    h = max(0, box.get("ymax", 0) - box.get("ymin", 0))
                    if w * h < schema.min_box_area:
                        continue
                severity = _resolve_pipeline_severity(label, schema.severity_map)
                results.append(
                    self._make_result(
                        finding_type=f"entity:{label}",
                        category="CONTENT",
                        severity=severity,
                        confidence=score,
                        matched_content=f"Detected: {label} ({score:.3f})",
                        location=Location(
                            description=(
                                f"box xmin={box.get('xmin')} ymin={box.get('ymin')}"
                                f" xmax={box.get('xmax')} ymax={box.get('ymax')}"
                            ),
                        ),
                        metadata={
                            "box": box,
                            "score": score,
                            "image_size": image_size,
                            "model": schema.model,
                        },
                    )
                )
        except Exception as exc:
            logger.error("object_detection error (model=%s): %s", schema.model, exc, exc_info=True)

        # Order and cap outside the try block so partial results collected
        # before a failure obey the same contract as a clean run.
        results.sort(key=lambda r: r.confidence, reverse=True)
        if schema.top_k is not None:
            results = results[: schema.top_k]
        return results

    def get_supported_content_types(self) -> list[str]:
        """Return the image content types this runner accepts."""
        return list(_IMAGE_CONTENT_TYPES)
|