classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,138 @@
1
+ """Feature extraction (dense embeddings) pipeline runner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from ....models.generated_detectors import FeatureExtractionPipelineSchema, Severity
9
+ from ....models.generated_single_asset_scan_results import DetectionResult
10
+ from ...dependencies import ensure_torch, require_module
11
+ from ._base import _TEXT_CONTENT_TYPES, BaseRunner
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ def _chunk_text_with_offsets(
17
+ text: str, chunk_size: int | None, chunk_overlap: int
18
+ ) -> list[tuple[str, int]]:
19
+ """Return (chunk, char_offset) pairs. Returns [(text, 0)] when chunk_size is not set."""
20
+ if not chunk_size:
21
+ return [(text, 0)]
22
+ step = max(1, chunk_size - chunk_overlap)
23
+ return [(text[i : i + chunk_size], i) for i in range(0, len(text), step)]
24
+
25
+
26
+ def _pool_hidden(
27
+ hidden: list[list[float]], pooling: str, normalize: bool
28
+ ) -> list[float] | list[list[float]] | None:
29
+ """Apply pooling strategy to per-token hidden states."""
30
+ try:
31
+ import numpy as np # type: ignore[import-untyped]
32
+ except ImportError:
33
+ logger.warning("numpy is required for feature extraction pooling")
34
+ return None
35
+ arr = np.array(hidden, dtype=np.float32)
36
+ if pooling == "cls":
37
+ vector = arr[0]
38
+ elif pooling == "max":
39
+ vector = arr.max(axis=0)
40
+ elif pooling == "none":
41
+ return arr.tolist() # type: ignore[no-any-return]
42
+ else:
43
+ vector = arr.mean(axis=0)
44
+ if normalize:
45
+ norm = float(np.linalg.norm(vector))
46
+ if norm > 0:
47
+ vector = vector / norm
48
+ return vector.tolist() # type: ignore[no-any-return]
49
+
50
+
51
class FeatureExtractionRunner(BaseRunner):
    """Dense vector embeddings via a single HuggingFace feature-extraction pipeline.

    The pipeline is built once in ``__init__``; ``detect()`` then emits one
    ``embedding`` finding per text chunk, with the vector stored in the
    finding's metadata.
    """

    def __init__(
        self,
        schema: FeatureExtractionPipelineSchema,
        detector_key: str = "",
        detector_name: str = "",
    ) -> None:
        """Build the feature-extraction pipeline described by *schema*.

        Args:
            schema: Pipeline configuration; ``model``, ``device``,
                ``model_revision``, ``truncation`` and ``max_length`` are read
                here, the remaining fields in ``detect()``.
            detector_key: Stored on the instance as ``_detector_key``.
            detector_name: Stored on the instance as ``_detector_name``.
        """
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name
        ensure_torch("feature_extraction", ["custom", "detectors"])
        transformers = require_module("transformers", "feature_extraction", ["custom", "detectors"])

        # Truncation defaults to on when the schema leaves it unset.
        truncation = schema.truncation if schema.truncation is not None else True
        tokenize_kwargs: dict[str, Any] = {"truncation": truncation}
        if schema.max_length is not None:
            tokenize_kwargs["max_length"] = schema.max_length

        pipeline_kwargs: dict[str, Any] = {
            "model": schema.model,
            "device": schema.device or "cpu",
            # The feature-extraction pipeline names this parameter
            # ``tokenize_kwargs``; an unrecognised spelling is dropped without
            # an error, which would leave truncation/max_length unapplied.
            "tokenize_kwargs": tokenize_kwargs,
        }
        if schema.model_revision:
            pipeline_kwargs["revision"] = schema.model_revision
        self._pipe: Any = transformers.pipeline("feature-extraction", **pipeline_kwargs)

    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
        """Unsupported entry point; this runner is driven through ``detect()``.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("FeatureExtractionRunner uses detect() directly")

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Embed *content* chunk by chunk and return one finding per chunk.

        Args:
            content: Text to embed. ``bytes`` input produces no findings.
            content_type: Must be one of ``_TEXT_CONTENT_TYPES``; anything else
                produces no findings.

        Returns:
            A list of ``embedding`` findings. Any exception raised while
            embedding is logged and ends processing; findings collected before
            the failure are still returned.
        """
        if isinstance(content, bytes):
            return []
        if content_type not in _TEXT_CONTENT_TYPES:
            return []
        # NOTE: chunk offsets reported below are relative to this stripped
        # text, not to the original ``content``.
        text = content.strip()
        if not text:
            return []

        schema = self._schema
        pooling = str(schema.pooling_strategy or "mean")
        normalize = schema.normalize_embeddings if schema.normalize_embeddings is not None else True
        # chunk_size / chunk_overlap may be wrapper objects exposing ``.root``;
        # unwrap them when that attribute is present.
        chunk_size: int | None = getattr(schema.chunk_size, "root", schema.chunk_size)
        chunk_overlap: int = getattr(schema.chunk_overlap, "root", schema.chunk_overlap) or 0

        results: list[DetectionResult] = []
        try:
            for chunk, offset in _chunk_text_with_offsets(text, chunk_size, chunk_overlap):
                raw: list[list[list[float]]] = self._pipe(chunk) or []
                if not raw or not raw[0]:
                    continue
                embedding = _pool_hidden(raw[0], pooling, normalize)
                if embedding is None:
                    continue
                dim: int | None
                if pooling == "none":
                    # Unpooled output is a matrix: report the width of a row.
                    dim = (
                        len(embedding[0]) if embedding and isinstance(embedding[0], list) else None
                    )  # type: ignore[index]
                else:
                    dim = len(embedding)  # type: ignore[arg-type]
                results.append(
                    self._make_result(
                        finding_type="embedding",
                        category="CLASSIFICATION",
                        severity=Severity.info,
                        confidence=1.0,
                        # Only a 256-character preview of the chunk is stored.
                        matched_content=chunk[:256],
                        location=None,
                        metadata={
                            "embedding": embedding,
                            "dimension": dim,
                            "pooling_strategy": pooling,
                            "normalized": normalize,
                            "model": schema.model,
                            "chunk_offset": offset,
                            "chunk_length": len(chunk),
                        },
                    )
                )
        except Exception as exc:
            logger.error(
                "feature_extraction error (model=%s): %s", schema.model, exc, exc_info=True
            )
        return results

    def get_supported_content_types(self) -> list[str]:
        """Return a fresh list of the text content types this runner accepts."""
        return list(_TEXT_CONTENT_TYPES)
@@ -0,0 +1,324 @@
1
+ """GLiNER2 pipeline runner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import re
8
+ import time
9
+ from datetime import UTC, datetime
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from ....models.generated_detectors import (
14
+ GLiNER2PipelineSchema,
15
+ PipelineEntityDefinition,
16
+ PipelineResult,
17
+ PipelineValidationConfig,
18
+ )
19
+ from ...dependencies import MissingDependencyError, require_module
20
+ from ._base import _DEFAULT_GLINER2_MODEL, BaseRunner
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class GLiNER2Runner(BaseRunner):
    """Run a GLiNER2 pipeline: one model serves entity extraction and classification.

    If the schema's ``model.path`` is a trained artifact directory (written by
    trainer.py), the runner:
    - loads fine-tuned GLiNER2 weights from ``<path>/gliner2/`` when present
    - classifies with per-task SetFit models from ``<path>/setfit/<task>/``
    """

    def __init__(
        self, schema: GLiNER2PipelineSchema, detector_key: str = "", detector_name: str = ""
    ) -> None:
        """Store the configuration and detect an artifact directory; no model is loaded yet."""
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name
        # Populated lazily by _load_model() / _get_setfit_model().
        self._model: Any | None = None
        self._setfit_models: dict[str, Any] | None = None
        self._setfit_labels: dict[str, list[str]] = {}
        self._artifact_dir: Path | None = None
        self._init_artifact_dir()

    def _init_artifact_dir(self) -> None:
        """Record ``model.path`` as the artifact dir when it is a directory holding manifest.json."""
        model_cfg = self._schema.model
        if model_cfg and model_cfg.path:
            candidate = Path(model_cfg.path)
            if candidate.is_dir() and (candidate / "manifest.json").exists():
                self._artifact_dir = candidate
                logger.info(
                    "Artifact directory detected for detector '%s': %s",
                    self._detector_key,
                    candidate,
                )

    def run(self, text: str) -> PipelineResult:
        """Extract entities and classifications from *text*.

        Returns:
            A populated ``PipelineResult``; an empty one when the model cannot
            be loaded or when inference raises.
        """
        started = time.monotonic()
        model = self._load_model()
        if model is None:
            return PipelineResult()

        entity_schema = self._build_entity_schema()
        classification_tasks = self._build_classification_tasks()

        raw_entities: dict[str, list[dict[str, object]]] = {}
        raw_classification: dict[str, dict[str, object]] = {}

        try:
            if entity_schema:
                # threshold=0.0 keeps every candidate; filtering happens in
                # the validation step below.
                extracted = model.extract_entities(
                    text,
                    entity_schema,
                    threshold=0.0,
                    include_confidence=True,
                    include_spans=True,
                )
                raw_entities = _normalise_entity_output(extracted, text)

            for task_name, labels in classification_tasks.items():
                # A trained SetFit model, when available, replaces GLiNER2's
                # own classifier for that task.
                setfit = self._get_setfit_model(task_name)
                if setfit is None:
                    raw_cls = model.classify(text, labels, threshold=0.0)
                else:
                    raw_cls = self._run_setfit(setfit, task_name, text)
                raw_classification[task_name] = _normalise_classification_output(raw_cls)

        except Exception as exc:  # pragma: no cover - runtime specific
            logger.error("GLiNER2 pipeline failed for detector '%s': %s", self._detector_key, exc)
            return PipelineResult()

        validation = self._schema.validation or PipelineValidationConfig()
        filtered_entities = _apply_entity_validation(raw_entities, validation, self._schema)
        filtered_classification = _apply_classification_validation(raw_classification, validation)

        latency_ms = round((time.monotonic() - started) * 1000)
        model_cfg = self._schema.model
        configured_name = model_cfg.name if model_cfg else None
        model_name = configured_name or _DEFAULT_GLINER2_MODEL
        runner_tag = "GLINER2+ARTIFACT" if self._artifact_dir else "GLINER2"

        metadata = {
            "model": model_name,
            "runner": runner_tag,
            "latency_ms": latency_ms,
            "timestamp": datetime.now(UTC).isoformat(),
        }
        return PipelineResult(
            entities=filtered_entities,
            classification=filtered_classification,
            metadata=metadata,
        )

    def _load_model(self) -> Any | None:
        """Return the cached GLiNER2 model, loading it on first use.

        Fine-tuned weights in the artifact directory are tried first; on any
        failure there, loading falls back to the configured path, the
        configured name, or the default model.

        Raises:
            MissingDependencyError: Propagated from the fallback load.
        """
        if self._model is not None:
            return self._model

        if self._artifact_dir is not None:
            gliner_path = self._artifact_dir / "gliner2"
            if gliner_path.is_dir():
                fine_tuned = str(gliner_path)
                logger.info(
                    "Loading fine-tuned GLiNER2 from '%s' for detector '%s'",
                    fine_tuned,
                    self._detector_key,
                )
                try:
                    gliner2_module = require_module(
                        "gliner2", "custom", ["classification", "detectors"]
                    )
                    self._model = gliner2_module.GLiNER2.from_pretrained(fine_tuned)
                    return self._model
                except Exception as exc:
                    logger.warning("Failed to load fine-tuned GLiNER2, falling back: %s", exc)

        # model.path is used as a model location only when it was not
        # recognised as an artifact directory.
        model_cfg = self._schema.model
        if model_cfg and model_cfg.path and not self._artifact_dir:
            model_name = model_cfg.path
        elif model_cfg and model_cfg.name:
            model_name = model_cfg.name
        else:
            model_name = _DEFAULT_GLINER2_MODEL

        try:
            gliner2_module = require_module("gliner2", "custom", ["classification", "detectors"])
            self._model = gliner2_module.GLiNER2.from_pretrained(model_name)
        except MissingDependencyError:
            raise
        except Exception as exc:  # pragma: no cover - environment specific
            logger.warning(
                "Failed to load GLiNER2 model '%s' for detector '%s': %s",
                model_name,
                self._detector_key,
                exc,
            )
            return None
        logger.info("GLiNER2 model '%s' loaded for detector '%s'", model_name, self._detector_key)
        return self._model

    def _get_setfit_model(self, task_name: str) -> Any | None:
        """Return the SetFit model for *task_name*, or ``None`` if no trained artifact exists.

        Results, including failures, are cached per task so each task is
        attempted at most once.
        """
        if self._artifact_dir is None:
            return None

        if self._setfit_models is None:
            self._setfit_models = {}
        cache = self._setfit_models

        if task_name in cache:
            return cache[task_name]

        model_path = self._artifact_dir / "setfit" / task_name
        labels_path = model_path / "labels.json"
        if not (model_path.is_dir() and labels_path.exists()):
            cache[task_name] = None
            return None

        try:
            from setfit import SetFitModel  # type: ignore[import-untyped]

            loaded = SetFitModel.from_pretrained(str(model_path))
            cache[task_name] = loaded
            self._setfit_labels[task_name] = json.loads(labels_path.read_text())
            logger.info("SetFit model for task '%s' loaded from '%s'", task_name, model_path)
            return loaded
        except Exception as exc:
            logger.warning("Failed to load SetFit model for task '%s': %s", task_name, exc)
            cache[task_name] = None
            return None

    def _run_setfit(self, model: Any, task_name: str, text: str) -> dict[str, object]:
        """Classify *text* with a SetFit model and return ``{"label", "confidence"}``.

        Returns an empty dict when inference fails.
        """
        try:
            import torch  # type: ignore[import-untyped]

            labels = self._setfit_labels.get(task_name, [])
            with torch.no_grad():
                probs = model.predict_proba([text])
            first = probs[0]
            prob_row = first.tolist() if hasattr(first, "tolist") else list(first)
            # Index of the highest probability (first one wins on ties).
            best_idx = int(max(range(len(prob_row)), key=prob_row.__getitem__))
            # Fall back to the numeric index when labels.json has no entry for it.
            best_label = labels[best_idx] if best_idx < len(labels) else str(best_idx)
            best_conf = float(prob_row[best_idx]) if best_idx < len(prob_row) else 0.0
            return {"label": best_label, "confidence": round(best_conf, 4)}
        except Exception as exc:
            logger.warning("SetFit inference failed for task '%s': %s", task_name, exc)
            return {}

    def _build_entity_schema(self) -> dict[str, str]:
        """Map each configured entity label to its description string."""
        mapping: dict[str, str] = {}
        for label, defn in (self._schema.entities or {}).items():
            if isinstance(defn, PipelineEntityDefinition):
                mapping[label] = defn.description
            else:
                mapping[label] = str(defn)
        return mapping

    def _build_classification_tasks(self) -> dict[str, list[str]]:
        """Map each configured classification task to its list of labels."""
        tasks = self._schema.classification or {}
        return {task: defn.labels for task, defn in tasks.items()}
220
+
221
+
222
+ # ── Normalisation helpers (used only by GLiNER2Runner) ────────────────────────
223
+
224
+
225
def _normalise_entity_output(raw: dict[str, Any], text: str) -> dict[str, list[dict[str, object]]]:
    """Convert raw entity output into ``{label: [normalised span, ...]}``.

    The mapping under ``raw["entities"]`` is used when that key exists,
    otherwise *raw* itself. A non-dict mapping yields an empty result, and
    labels whose spans all fail normalisation are omitted.
    """
    entities = raw.get("entities", raw)
    if not isinstance(entities, dict):
        return {}

    normalised_by_label: dict[str, list[dict[str, object]]] = {}
    for label, spans in entities.items():
        # A lone span is treated as a one-element list.
        candidates: list[Any] = spans if isinstance(spans, list) else [spans]
        kept = []
        for candidate in candidates:
            span = _normalise_span(candidate, text)
            if span:
                kept.append(span)
        if kept:
            normalised_by_label[label] = kept

    return normalised_by_label
238
+
239
+
240
+ def _normalise_span(span: Any, text: str) -> dict[str, object] | None:
241
+ if isinstance(span, dict):
242
+ value = str(span.get("text", "")).strip()
243
+ confidence = float(span.get("confidence", span.get("score", 0.0)))
244
+ start = span.get("start")
245
+ end = span.get("end")
246
+ else:
247
+ value = str(span).strip()
248
+ confidence = 1.0
249
+ start = None
250
+ end = None
251
+
252
+ if not value and isinstance(start, int) and isinstance(end, int):
253
+ value = text[start:end].strip()
254
+ if not value:
255
+ return None
256
+
257
+ if not isinstance(start, int) or not isinstance(end, int):
258
+ start = text.find(value)
259
+ end = start + len(value) if start >= 0 else -1
260
+
261
+ if start < 0:
262
+ return None
263
+
264
+ return {"value": value, "confidence": round(confidence, 4), "start": start, "end": end}
265
+
266
+
267
+ def _normalise_classification_output(raw: Any) -> dict[str, object]:
268
+ if isinstance(raw, dict):
269
+ label = raw.get("label", "")
270
+ confidence = float(raw.get("confidence", raw.get("score", 0.0)))
271
+ elif isinstance(raw, (list, tuple)) and raw:
272
+ best = max(raw, key=lambda x: x.get("score", 0.0) if isinstance(x, dict) else 0.0)
273
+ label = best.get("label", "") if isinstance(best, dict) else str(best)
274
+ confidence = float(best.get("score", 0.0)) if isinstance(best, dict) else 1.0
275
+ else:
276
+ return {}
277
+
278
+ return {"label": label, "confidence": round(confidence, 4)}
279
+
280
+
281
+ def _apply_entity_validation(
282
+ entities: dict[str, list[dict[str, object]]],
283
+ validation: PipelineValidationConfig,
284
+ schema: GLiNER2PipelineSchema,
285
+ ) -> dict[str, list[dict[str, object]]]:
286
+ threshold = validation.confidence_threshold or 0.7
287
+ result: dict[str, list[dict[str, object]]] = {}
288
+
289
+ for label, spans in entities.items():
290
+ passing = [
291
+ span
292
+ for span in spans
293
+ if isinstance(span.get("confidence"), (int, float))
294
+ and float(span["confidence"]) >= threshold # type: ignore[arg-type]
295
+ ]
296
+ for rule in validation.rules or []:
297
+ if rule.field == label and rule.type == "regex" and rule.pattern:
298
+ try:
299
+ rx = re.compile(rule.pattern)
300
+ passing = [s for s in passing if rx.search(str(s.get("value", "")))]
301
+ except re.error as exc:
302
+ logger.warning("Invalid validation regex for field '%s': %s", label, exc)
303
+ if passing:
304
+ result[label] = passing
305
+
306
+ for label, defn in (schema.entities or {}).items():
307
+ if isinstance(defn, PipelineEntityDefinition) and defn.required and label not in result:
308
+ logger.debug("Required entity '%s' not found — suppressing all findings", label)
309
+ return {}
310
+
311
+ return result
312
+
313
+
314
+ def _apply_classification_validation(
315
+ classification: dict[str, dict[str, object]],
316
+ validation: PipelineValidationConfig,
317
+ ) -> dict[str, dict[str, object]]:
318
+ threshold = validation.confidence_threshold or 0.7
319
+ return {
320
+ task: outcome
321
+ for task, outcome in classification.items()
322
+ if isinstance(outcome.get("confidence", 0.0), (int, float))
323
+ and float(outcome["confidence"]) >= threshold # type: ignore[arg-type]
324
+ }
@@ -0,0 +1,98 @@
1
+ """Image classification pipeline runner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import logging
7
+ from typing import Any
8
+
9
+ from ....models.generated_detectors import ImageClassificationPipelineSchema
10
+ from ....models.generated_single_asset_scan_results import DetectionResult
11
+ from ...dependencies import ensure_torch, require_module
12
+ from ._base import (
13
+ _DEFAULT_IMAGE_CLASSIFICATION_MODEL,
14
+ _IMAGE_CONTENT_TYPES,
15
+ BaseRunner,
16
+ _resolve_pipeline_severity,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class ImageClassificationRunner(BaseRunner):
    """Classify images with one HuggingFace ``image-classification`` pipeline.

    The pipeline is created in ``__init__``; ``detect()`` turns each
    prediction at or above the confidence threshold into a finding.
    """

    def __init__(
        self,
        schema: ImageClassificationPipelineSchema,
        detector_key: str = "",
        detector_name: str = "",
    ) -> None:
        """Resolve dependencies and build the pipeline described by *schema*."""
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name

        extras = ["custom", "detectors"]
        ensure_torch("image_classification", extras)
        transformers = require_module("transformers", "image_classification", extras)
        self._pil = require_module("PIL.Image", "image_classification", extras)

        # Fall back to the module default when the schema names no model.
        model_id = schema.model or _DEFAULT_IMAGE_CLASSIFICATION_MODEL
        pipeline_kwargs: dict[str, Any] = {"model": model_id, "device": schema.device or "cpu"}
        # Optional settings are forwarded only when the schema provides them.
        if schema.model_revision:
            pipeline_kwargs["revision"] = schema.model_revision
        if schema.top_k is not None:
            pipeline_kwargs["top_k"] = schema.top_k
        if schema.function_to_apply is not None:
            pipeline_kwargs["function_to_apply"] = str(schema.function_to_apply)

        self._pipe: Any = transformers.pipeline("image-classification", **pipeline_kwargs)
        self._model_id = model_id

    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
        """Unsupported entry point; always raises ``NotImplementedError``."""
        raise NotImplementedError("ImageClassificationRunner uses detect() directly")

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Classify an image and return findings sorted by descending confidence.

        Args:
            content: Encoded image bytes. ``str`` input is rejected with a warning.
            content_type: Must start with ``"image/"``; otherwise nothing is returned.

        Returns:
            One finding per prediction whose score is at least the schema's
            ``confidence_threshold`` (0.0 when unset). Exceptions during
            decoding or inference are logged; findings gathered before the
            failure are still returned.
        """
        if not content_type.startswith("image/"):
            return []
        if isinstance(content, str):
            logger.warning("image_classification: received string content, expected bytes")
            return []

        schema = self._schema
        min_score = 0.0 if schema.confidence_threshold is None else schema.confidence_threshold
        findings: list[DetectionResult] = []
        try:
            image = self._pil.open(io.BytesIO(content))
            predictions: list[dict[str, Any]] = self._pipe(image) or []
            for prediction in predictions:
                label: str = prediction.get("label", "unknown")
                score: float = float(prediction.get("score", 0.0))
                if score < min_score:
                    continue
                severity = _resolve_pipeline_severity(label, schema.severity_map)
                width, height = image.size[0], image.size[1]
                finding = self._make_result(
                    finding_type=f"classification:{label}",
                    category="CONTENT",
                    severity=severity,
                    confidence=score,
                    matched_content=f"Image classified as: {label} ({score:.3f})",
                    location=None,
                    metadata={
                        "image_size": f"{width}x{height}",
                        "image_mode": image.mode,
                        "model": self._model_id,
                    },
                )
                findings.append(finding)
        except Exception as exc:
            logger.error(
                "image_classification error (model=%s): %s", self._model_id, exc, exc_info=True
            )
        # Sorting runs even after an error so partial results stay ordered.
        findings.sort(key=lambda finding: finding.confidence, reverse=True)
        return findings

    def get_supported_content_types(self) -> list[str]:
        """Return a fresh list of the image content types this runner accepts."""
        return list(_IMAGE_CONTENT_TYPES)
@@ -0,0 +1,22 @@
1
+ """LLM pipeline runner (stub — not yet implemented)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ....models.generated_detectors import LLMPipelineSchema, PipelineResult
6
+ from ._base import BaseRunner
7
+
8
+
9
class LLMRunner(BaseRunner):
    """Placeholder for LLM-based detection; inference is not implemented yet."""

    def __init__(
        self, schema: LLMPipelineSchema, detector_key: str = "", detector_name: str = ""
    ) -> None:
        """Keep the configuration on the instance; nothing else is set up."""
        self._detector_name = detector_name
        self._detector_key = detector_key
        self._schema = schema

    def run(self, text: str) -> PipelineResult:  # pragma: no cover - stub
        """Always raise ``NotImplementedError`` naming the detector key."""
        message = f"LLM runner is not yet implemented (detector '{self._detector_key}')"
        raise NotImplementedError(message)
@@ -0,0 +1,107 @@
1
+ """Object detection pipeline runner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import logging
7
+ from typing import Any
8
+
9
+ from ....models.generated_detectors import ObjectDetectionPipelineSchema
10
+ from ....models.generated_single_asset_scan_results import DetectionResult, Location
11
+ from ...dependencies import MissingDependencyError, ensure_torch, require_module
12
+ from ._base import _IMAGE_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class ObjectDetectionRunner(BaseRunner):
    """Detect objects in images with one HuggingFace ``object-detection`` pipeline.

    The pipeline is created in ``__init__``; ``detect()`` filters its output
    by score and box area and reports each surviving box as a finding.
    """

    def __init__(
        self,
        schema: ObjectDetectionPipelineSchema,
        detector_key: str = "",
        detector_name: str = "",
    ) -> None:
        """Resolve dependencies and build the pipeline described by *schema*.

        Raises:
            MissingDependencyError: When building the pipeline raises ``ImportError``.
        """
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name

        extras = ["custom", "detectors"]
        ensure_torch("object_detection", extras)
        transformers = require_module("transformers", "object_detection", extras)
        self._pil = require_module("PIL.Image", "object_detection", extras)

        pipeline_kwargs: dict[str, Any] = {"model": schema.model, "device": schema.device or "cpu"}
        if schema.model_revision:
            pipeline_kwargs["revision"] = schema.model_revision
        # nms_threshold may be a wrapper object exposing ``.root``; unwrap it.
        # NOTE(review): the value is forwarded as the pipeline's ``threshold``
        # argument — confirm that this is the intended meaning.
        nms = getattr(schema.nms_threshold, "root", schema.nms_threshold)
        if nms is not None:
            pipeline_kwargs["threshold"] = nms

        try:
            self._pipe: Any = transformers.pipeline("object-detection", **pipeline_kwargs)
        except ImportError as exc:
            raise MissingDependencyError(
                "object_detection",
                ["custom", "detectors"],
                f"ObjectDetectionRunner requires additional dependencies: {exc}",
            ) from exc

    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
        """Unsupported entry point; always raises ``NotImplementedError``."""
        raise NotImplementedError("ObjectDetectionRunner uses detect() directly")

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Detect objects in an image and return findings sorted by descending confidence.

        Args:
            content: Encoded image bytes. ``str`` input is rejected with a warning.
            content_type: Must start with ``"image/"``; otherwise nothing is returned.

        Returns:
            One finding per detection whose score is at least the schema's
            ``confidence_threshold`` (0.5 when unset) and whose box area is at
            least ``min_box_area`` when that is set, limited to ``top_k``
            entries when that is set. If an exception occurs it is logged and
            the findings gathered so far are returned without sorting or
            truncation.
        """
        if not content_type.startswith("image/"):
            return []
        if isinstance(content, str):
            logger.warning("object_detection: received string content, expected bytes")
            return []

        schema = self._schema
        min_score = 0.5 if schema.confidence_threshold is None else schema.confidence_threshold
        findings: list[DetectionResult] = []
        try:
            image = self._pil.open(io.BytesIO(content))
            detections: list[dict[str, Any]] = self._pipe(image) or []
            for detection in detections:
                label: str = detection.get("label", "unknown")
                score: float = float(detection.get("score", 0.0))
                box: dict[str, int] = detection.get("box", {})
                if score < min_score:
                    continue
                if schema.min_box_area is not None:
                    # Missing coordinates count as 0; negative extents clamp to 0.
                    width = max(0, box.get("xmax", 0) - box.get("xmin", 0))
                    height = max(0, box.get("ymax", 0) - box.get("ymin", 0))
                    if width * height < schema.min_box_area:
                        continue
                severity = _resolve_pipeline_severity(label, schema.severity_map)
                where = Location(
                    description=(
                        f"box xmin={box.get('xmin')} ymin={box.get('ymin')}"
                        f" xmax={box.get('xmax')} ymax={box.get('ymax')}"
                    ),
                )
                finding = self._make_result(
                    finding_type=f"entity:{label}",
                    category="CONTENT",
                    severity=severity,
                    confidence=score,
                    matched_content=f"Detected: {label} ({score:.3f})",
                    location=where,
                    metadata={
                        "box": box,
                        "score": score,
                        "image_size": f"{image.size[0]}x{image.size[1]}",
                        "model": schema.model,
                    },
                )
                findings.append(finding)
            findings.sort(key=lambda finding: finding.confidence, reverse=True)
            if schema.top_k is not None:
                findings = findings[: schema.top_k]
        except Exception as exc:
            logger.error("object_detection error (model=%s): %s", schema.model, exc, exc_info=True)
        return findings

    def get_supported_content_types(self) -> list[str]:
        """Return a fresh list of the image content types this runner accepts."""
        return list(_IMAGE_CONTENT_TYPES)