classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
src/main.py
ADDED
|
@@ -0,0 +1,608 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, cast
|
|
9
|
+
|
|
10
|
+
from .outputs import create_output_sink
|
|
11
|
+
from .sources import get_source, list_available_sources
|
|
12
|
+
from .utils.validation import validate_input, validate_test_connection
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
_TIMEOUT_PHRASES = ("timed out", "timeout", "connection reset", "errno 110", "connection refused")
|
|
19
|
+
_TIMEOUT_MYSQL_CODES = {2003, 2006, 2013}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _is_timeout_error(exc: BaseException) -> bool:
    """Tell whether *exc* looks like a timeout / unreachable-host failure.

    Three signals are tried in order, and the first hit wins:

    1. the lower-cased message contains one of ``_TIMEOUT_PHRASES``;
    2. the exception class name contains ``"timeout"`` (case-insensitive);
    3. the first positional argument is an int listed in
       ``_TIMEOUT_MYSQL_CODES``.
    """
    message = str(exc).lower()
    for phrase in _TIMEOUT_PHRASES:
        if phrase in message:
            return True

    class_name = type(exc).__name__.lower()
    if "timeout" in class_name:
        return True

    exc_args = getattr(exc, "args", ())
    if not exc_args:
        return False
    leading = exc_args[0]
    return isinstance(leading, int) and leading in _TIMEOUT_MYSQL_CODES
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _sanitize_for_json(value: Any) -> Any:
|
|
36
|
+
"""Recursively replace isolated surrogate code points before JSON encoding."""
|
|
37
|
+
if isinstance(value, str):
|
|
38
|
+
return _SURROGATE_RE.sub("\ufffd", value)
|
|
39
|
+
if isinstance(value, list):
|
|
40
|
+
return [_sanitize_for_json(item) for item in value]
|
|
41
|
+
if isinstance(value, dict):
|
|
42
|
+
return {key: _sanitize_for_json(item) for key, item in value.items()}
|
|
43
|
+
return value
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def setup_logging() -> None:
    """Set up root logging through ``logging.basicConfig``.

    Records at INFO and above are written to ``sys.stderr`` using the layout
    ``LEVEL:logger-name: message``.
    """
    message_layout = "%(levelname)s:%(name)s: %(message)s"
    logging.basicConfig(
        stream=sys.stderr,
        format=message_layout,
        level=logging.INFO,
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def load_local_env() -> None:
    """Load ``KEY=VALUE`` pairs from ``.env`` in the current working directory.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Variables that already exist in the process environment are
    never overwritten.

    A value wrapped in ONE matching pair of single or double quotes has that
    pair removed. Any other quote character is part of the value and is kept,
    so ``KEY=rock'n'roll'`` is loaded verbatim instead of losing its trailing
    apostrophe.

    Loading is best effort: any failure is logged as a warning and swallowed.
    """
    env_path = Path(".env")
    if not env_path.exists():
        return

    try:
        for raw_line in env_path.read_text(encoding="utf-8").splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            key, separator, value = line.partition("=")
            key = key.strip()
            if not separator or not key or key in os.environ:
                continue

            value = value.strip()
            # Unwrap only a matching pair of surrounding quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os.environ[key] = value
    except Exception as exc:
        logger.warning("Failed to load .env file: %s", exc)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def load_recipe(recipe_path: str) -> dict[str, Any]:
    """Read the recipe JSON file at *recipe_path* and return it as a dict.

    The file is decoded as UTF-8 and must contain a JSON object at top level.
    Every failure (missing path, unreadable or undecodable file, invalid
    JSON, non-object document) is logged as an error and ends the process
    with exit status 1, so the caller only ever receives a valid dict.
    """
    path = Path(recipe_path)
    if not path.exists():
        logger.error("Recipe file not found at %s", recipe_path)
        sys.exit(1)

    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error("Invalid JSON in recipe file: %s", e)
        sys.exit(1)
    except (OSError, UnicodeDecodeError) as e:
        # Covers a directory path, a permission problem or non-UTF-8 bytes,
        # which would otherwise surface as an unhandled traceback.
        logger.error("Could not read recipe file %s: %s", recipe_path, e)
        sys.exit(1)

    if not isinstance(data, dict):
        logger.error("Recipe JSON must be an object at top level")
        sys.exit(1)
    return cast(dict[str, Any], data)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _compute_findings_counts(
|
|
98
|
+
findings: list[Any],
|
|
99
|
+
) -> tuple[int, dict[str, int], dict[str, dict[str, int]]]:
|
|
100
|
+
"""Return (total, by_severity, by_detector) counts from a findings list."""
|
|
101
|
+
by_severity: dict[str, int] = {}
|
|
102
|
+
by_detector: dict[str, dict[str, int]] = {}
|
|
103
|
+
|
|
104
|
+
for f in findings:
|
|
105
|
+
severity = str(getattr(f, "severity", None) or "UNKNOWN")
|
|
106
|
+
detector = str(getattr(f, "detector_type", None) or "UNKNOWN")
|
|
107
|
+
|
|
108
|
+
by_severity[severity] = by_severity.get(severity, 0) + 1
|
|
109
|
+
|
|
110
|
+
entry = by_detector.setdefault(detector, {"total": 0})
|
|
111
|
+
entry["total"] += 1
|
|
112
|
+
entry[severity] = entry.get(severity, 0) + 1
|
|
113
|
+
|
|
114
|
+
return len(findings), by_severity, by_detector
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _asset_to_payload(asset: Any) -> dict[str, Any]:
|
|
118
|
+
if hasattr(asset, "model_dump"):
|
|
119
|
+
payload = asset.model_dump(mode="json", exclude_none=True)
|
|
120
|
+
if isinstance(payload, dict):
|
|
121
|
+
return cast(dict[str, Any], payload)
|
|
122
|
+
raise TypeError(f"model_dump() must return dict, got {type(payload)}")
|
|
123
|
+
if isinstance(asset, dict):
|
|
124
|
+
return cast(dict[str, Any], asset)
|
|
125
|
+
raise TypeError(f"Unsupported asset payload type: {type(asset)}")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _require_connection(source: Any) -> None:
    """Exit with status 1 when the source's connection test reports FAILURE."""
    result = source.test_connection()
    if result.get("status") == "FAILURE":
        logger.error("Aborting: Connection test failed: %s", result.get("message", ""))
        sys.exit(1)


def _run_test_command(source: Any) -> None:
    """Run ``test``: print the connection-test result as JSON on stdout."""
    result = source.test_connection()
    logger.info("Validating test connection output...")
    try:
        validate_test_connection(result)
        logger.info("Test connection output is valid")
    except Exception as validation_error:
        # Validation is advisory: the result is still printed below.
        logger.warning("Test connection output validation failed: %s", validation_error)

    print(json.dumps(result, indent=2))
    if result.get("status") == "FAILURE":
        sys.exit(1)


def _run_discover_command(source: Any) -> None:
    """Run ``discover``: print the source's discovery data as JSON on stdout."""
    _require_connection(source)
    logger.info("Discovering resources...")
    data = source.discover()
    print(json.dumps(data, indent=2))


async def _report_asset_status(
    sink: Any, asset_hash: str, status: str, findings: list[Any]
) -> None:
    """Send *status* for one asset together with its findings counters."""
    f_total, f_by_sev, f_by_det = _compute_findings_counts(findings)
    await sink.update_asset_status(
        asset_hash,
        status,
        findings_total=f_total,
        findings_by_severity=f_by_sev,
        findings_by_detector=f_by_det,
    )


async def _discover_assets(source: Any, sink: Any) -> tuple[list[Any], int]:
    """Phase 1: stream raw assets in discovery-only mode and emit stubs.

    Every non-empty batch is emitted to the sink with an empty ``findings``
    list, and its hashes are registered when the sink supports it.

    Returns:
        ``(assets, batch_count)``: all raw assets seen and the number of
        batches emitted.
    """
    source.set_discovery_only(True)
    discovered: list[Any] = []
    batch_count = 0

    async for raw_batch in source.extract_raw():
        if not raw_batch:
            continue

        stub_batch = [_asset_to_payload(asset) for asset in raw_batch]
        for stub in stub_batch:
            stub["findings"] = []
        await sink.emit_batch(stub_batch, skip_findings=True)
        batch_count += 1

        hashes = [s["hash"] for s in stub_batch if s.get("hash")]
        if hasattr(sink, "register_discovered_assets") and hashes:
            await sink.register_discovered_assets(hashes)

        discovered.extend(raw_batch)
        logger.info(
            "Discovered %s assets (total: %s)",
            len(raw_batch),
            len(discovered),
        )

    source.set_discovery_only(False)
    logger.info("Phase 1 complete: %d assets discovered", len(discovered))
    return discovered, batch_count


async def _process_assets(
    args: argparse.Namespace,
    source: Any,
    sink: Any,
    pipeline: Any,
    assets: list[Any],
) -> None:
    """Phase 2: run the detector pipeline over *assets*.

    At most ``args.processing_workers`` assets are in flight at once. A
    failing asset is logged, reported as ``ERROR`` when the sink supports
    status updates, and does not stop the remaining assets.
    """
    import asyncio

    semaphore = asyncio.Semaphore(args.processing_workers)
    reports_status = hasattr(sink, "update_asset_status")

    async def _process_one(asset: Any) -> bool:
        """Process one asset; return True on success, False on failure."""
        async with semaphore:
            asset_hash = getattr(asset, "hash", None) or ""
            try:
                if reports_status:
                    await sink.update_asset_status(asset_hash, "PROCESSING")

                async def _on_findings_flushed(partial: list[Any]) -> None:
                    # partial is the full accumulated findings list from the pipeline
                    stub_payload = _asset_to_payload(asset)
                    stub_payload["findings"] = [
                        f.model_dump(mode="json", exclude_none=True)
                        if hasattr(f, "model_dump")
                        else f
                        for f in partial
                    ]
                    await sink.emit_batch([stub_payload], skip_findings=False)
                    if reports_status:
                        await _report_asset_status(
                            sink, asset_hash, "PROCESSING", partial
                        )

                result = await pipeline.process_single_asset(
                    asset,
                    on_findings_flushed=_on_findings_flushed,
                    findings_flush_size=args.detector_flush_batch_size,
                )
                payload = _asset_to_payload(result)
                await sink.emit_batch([payload], skip_findings=False)

                if reports_status:
                    await _report_asset_status(
                        sink, asset_hash, "PROCESSED", result.findings or []
                    )

                source.evict_asset_cache(asset_hash)
                return True
            except Exception as exc:
                logger.error("Asset %s failed: %s", asset_hash, exc)
                if reports_status:
                    try:
                        error_msg = str(exc) or type(exc).__name__
                        await sink.update_asset_status(asset_hash, "ERROR", error_msg)
                    except Exception:
                        # Best effort: the asset failure is already logged above.
                        logger.debug(
                            "Could not report ERROR status for asset %s",
                            asset_hash,
                            exc_info=True,
                        )
                return False

    outcomes = await asyncio.gather(
        *(_process_one(asset) for asset in assets), return_exceptions=True
    )
    # Outcomes that are neither True nor False (e.g. a cancellation returned
    # by gather) are counted in neither bucket.
    processed_count = sum(1 for outcome in outcomes if outcome is True)
    error_count = sum(1 for outcome in outcomes if outcome is False)
    logger.info(
        "Phase 2 complete: %d processed, %d errors",
        processed_count,
        error_count,
    )


async def _mark_assets_processed(sink: Any, assets: list[Any]) -> None:
    """Mark every discovered asset as PROCESSED when no detector is configured."""
    import asyncio

    async def _mark_processed(asset: Any) -> None:
        asset_hash = getattr(asset, "hash", None) or ""
        await sink.update_asset_status(asset_hash, "PROCESSED", findings_total=0)

    outcomes = await asyncio.gather(
        *(_mark_processed(asset) for asset in assets), return_exceptions=True
    )
    failures = [outcome for outcome in outcomes if isinstance(outcome, BaseException)]
    if failures:
        # Surface failures instead of silently reporting full success.
        logger.warning(
            "Failed to mark %d assets as processed (first error: %s)",
            len(failures),
            failures[0],
        )
    logger.info(
        "Phase 2 skipped (no detectors): %d assets marked processed",
        len(assets) - len(failures),
    )


async def _run_extract_command(
    args: argparse.Namespace,
    recipe: dict[str, Any],
    source: Any,
    runner_id: str,
) -> None:
    """Run ``extract``: discover assets, run detectors, stream to the sink.

    A timeout-like error ends the run with a warning (the sink is finished
    only if it was started). Any other error marks a started sink as failed
    and is re-raised to the caller.
    """
    _require_connection(source)

    logger.info("Starting extraction...")
    sink = create_output_sink(args)
    sink_started = False

    try:
        await sink.start()
        sink_started = True

        from .pipeline.detector_pipeline import DetectorPipeline

        pipeline = DetectorPipeline.from_recipe(
            recipe,
            source,
            runner_id,
            max_concurrent_assets=args.detector_max_concurrent,
        )
        has_detectors = bool(pipeline.detectors)

        # --- Phase 1: Discovery ---
        assets, output_batch_count = await _discover_assets(source, sink)

        # --- Phase 2: Processing ---
        if has_detectors and assets:
            await _process_assets(args, source, sink, pipeline, assets)
        elif assets and hasattr(sink, "update_asset_status"):
            await _mark_assets_processed(sink, assets)

        await sink.finish()
        logger.info(
            "Extraction completed: %s assets in %s batches",
            len(assets),
            output_batch_count,
        )
    except Exception as extraction_error:
        if _is_timeout_error(extraction_error):
            logger.warning(
                "Source timed out during extraction, partial results flushed: %s",
                extraction_error,
            )
            # A sink whose start() never completed has nothing to finish.
            if sink_started:
                await sink.finish()
            return
        if sink_started:
            try:
                await sink.fail(extraction_error)
            except Exception as sink_error:
                logger.error(
                    "Failed to mark sink failure: %s", sink_error, exc_info=True
                )
        raise


async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) -> None:
    """Initialize the source and execute ``args.command`` against it.

    The runner id comes from ``args.runner_id``, then ``RUNNER_ID``, then
    ``"local-run"``. The source id comes from ``args.source_id`` or
    ``SOURCE_ID`` and, when set, is written back to ``SOURCE_ID``.

    Outcomes:
        * source initialisation failure (``ValueError``) -> exit status 1;
        * timeout-like command failure -> warning, normal return;
        * any other command failure -> error log, exit status 1.

    ``source.cleanup()`` runs after the command in every case.
    """
    runner_id = args.runner_id or os.environ.get("RUNNER_ID") or "local-run"
    source_id = args.source_id or os.environ.get("SOURCE_ID")
    if source_id:
        os.environ["SOURCE_ID"] = source_id

    try:
        try:
            source = get_source(recipe, source_id=source_id, runner_id=runner_id)
        except ValueError as e:
            logger.error("Failed to initialize source: %s", e)
            sys.exit(1)

        try:
            if args.command == "test":
                _run_test_command(source)
            elif args.command == "discover":
                _run_discover_command(source)
            elif args.command == "extract":
                await _run_extract_command(args, recipe, source, runner_id)
        except Exception as e:
            logger.debug("Traceback for %s failure:", args.command, exc_info=True)
            if _is_timeout_error(e):
                logger.warning("SCAN TIMED OUT (source unreachable): %s", e)
                return
            logger.error("SCAN FAILED: %s", e)
            sys.exit(1)
        finally:
            source.cleanup()
    except Exception as e:
        logger.debug("Traceback for fatal error:", exc_info=True)
        logger.error("FATAL: %s", e)
        sys.exit(1)
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def run_command(args: argparse.Namespace, recipe: dict[str, Any]) -> None:
    """Synchronous entry point: drive ``run_command_async`` with ``asyncio.run``."""
    import asyncio

    pending = run_command_async(args, recipe)
    asyncio.run(pending)
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def run_train_command(args: argparse.Namespace) -> None:
    """Train detector models from a pipeline schema and labeled examples.

    Reads ``args.pipeline_schema`` and ``args.examples`` as UTF-8 JSON files,
    hands them with ``args.output_dir`` to ``GLiNER2Trainer``, and prints
    ``result.to_dict()`` as a single JSON document on stdout.

    A missing input file, or one that is not decodable / valid JSON, is
    logged as an error and ends the process with exit status 1.
    """
    from .detectors.custom.trainer import GLiNER2Trainer

    schema_path = Path(args.pipeline_schema)
    examples_path = Path(args.examples)
    output_dir = Path(args.output_dir)

    if not schema_path.exists():
        logger.error("Pipeline schema file not found: %s", schema_path)
        sys.exit(1)
    if not examples_path.exists():
        logger.error("Examples file not found: %s", examples_path)
        sys.exit(1)

    try:
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default encoding.
        pipeline_schema: dict[str, Any] = json.loads(
            schema_path.read_text(encoding="utf-8")
        )
        examples_raw: list[dict[str, Any]] = json.loads(
            examples_path.read_text(encoding="utf-8")
        )
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        logger.error("Invalid JSON in input files: %s", e)
        sys.exit(1)

    trainer = GLiNER2Trainer(pipeline_schema, examples_raw, output_dir)
    result = trainer.train()
    print(json.dumps(result.to_dict()))
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def run_sandbox_command(args: argparse.Namespace) -> None:
    """Run the ``sandbox`` command on a single local file.

    The file path is taken from the positional ``recipe`` argument. Detector
    configs are optionally loaded from ``args.detectors_file``, which must
    hold a JSON array. The file is handed to ``SandboxRunner`` and the
    resulting mime type, findings and optional parse error are printed as one
    JSON document on stdout. Every failure is logged and ends the process
    with exit status 1.
    """
    from .sandbox import SandboxRunner

    target: str | None = args.recipe
    if not target:
        logger.error("sandbox command requires a file path as the first argument")
        sys.exit(1)

    file_path = Path(target)
    if not file_path.exists():
        logger.error("File not found: %s", file_path)
        sys.exit(1)

    detectors: list[dict[str, Any]] = []
    if args.detectors_file:
        detectors_path = Path(args.detectors_file)
        if not detectors_path.exists():
            logger.error("Detectors file not found: %s", detectors_path)
            sys.exit(1)
        try:
            with detectors_path.open("r", encoding="utf-8") as handle:
                detectors = json.load(handle)
        except json.JSONDecodeError as e:
            logger.error("Invalid JSON in detectors file: %s", e)
            sys.exit(1)
        if not isinstance(detectors, list):
            logger.error("Detectors file must contain a JSON array")
            sys.exit(1)

    try:
        parsed, findings = SandboxRunner(detectors).run(file_path)
        parse_error = parsed.parse_error
        if parse_error:
            logger.warning("File parse warning: %s", parse_error)

        output: dict[str, Any] = {
            "mime_type": parsed.mime_type,
            "findings": [finding.model_dump(mode="json") for finding in findings],
        }
        if parse_error:
            output["parse_error"] = parse_error

        serialized = json.dumps(_sanitize_for_json(output), ensure_ascii=False)
        print(serialized)
    except Exception as e:
        logger.error("Sandbox run failed: %s", e, exc_info=True)
        sys.exit(1)
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _int_setting(
    cli_value: int | None, env_var: str, default: int, minimum: int = 1
) -> int:
    """Resolve an integer option: CLI value, else *env_var*, else *default*.

    An unset, empty or non-integer environment value falls back to
    *default*. The result is never lower than *minimum*.
    """
    if cli_value is None:
        raw = os.environ.get(env_var)
        try:
            cli_value = int(raw) if raw else default
        except ValueError:
            cli_value = default
    return max(cli_value, minimum)


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser shared by every CLI command."""
    parser = argparse.ArgumentParser(description="Classifyre Metadata Extraction CLI")
    parser.add_argument(
        "command",
        choices=["test", "extract", "discover", "sandbox", "train"],
        help="Command to run",
    )
    parser.add_argument(
        "recipe",
        nargs="?",
        default=None,
        help="Path to recipe JSON (or file path for sandbox command)",
    )
    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
    parser.add_argument(
        "--output-type",
        choices=["rest", "file", "console"],
        default=None,
        help="Output destination type for extract",
    )
    parser.add_argument(
        "--output-batch-size",
        type=int,
        default=None,
        help="Output batch size override (default: 20)",
    )
    parser.add_argument(
        "--output-rest-url",
        default=None,
        help=(
            "REST output base URL (defaults to CLASSIFYRE_OUTPUT_REST_URL, API_URL, "
            "or http://localhost:8000)"
        ),
    )
    parser.add_argument(
        "--output-file-path",
        default=None,
        help="File output path for NDJSON envelopes",
    )
    parser.add_argument("--source-id", default=None, help="Source UUID for REST output")
    parser.add_argument("--runner-id", default=None, help="Runner UUID for REST output")
    parser.add_argument(
        "--managed-runner",
        action="store_true",
        help="Managed mode for API-orchestrated REST runs",
    )
    parser.add_argument(
        "--detectors-file",
        default=None,
        help="Path to JSON file with detector configs (sandbox command only)",
    )
    # train-command arguments
    parser.add_argument(
        "--pipeline-schema",
        default=None,
        help="Path to pipeline schema JSON file (train command only)",
    )
    parser.add_argument(
        "--examples",
        default=None,
        help="Path to training examples JSON file (train command only)",
    )
    parser.add_argument(
        "--output-dir",
        default=None,
        help="Directory to write trained model artifacts (train command only)",
    )
    parser.add_argument(
        "--detector-flush-batch-size",
        type=int,
        default=None,
        help="How many detector-processed assets to accumulate before pushing findings to the API (default: 5, env: CLASSIFYRE_DETECTOR_FLUSH_BATCH_SIZE)",
    )
    parser.add_argument(
        "--detector-max-concurrent",
        type=int,
        default=None,
        help="Max assets processed in parallel by the detector pipeline (default: 10, env: CLASSIFYRE_DETECTOR_MAX_CONCURRENT)",
    )
    parser.add_argument(
        "--processing-workers",
        type=int,
        default=None,
        help="Number of parallel asset-processing workers in Phase 2 (default: 2, env: CLASSIFYRE_PROCESSING_WORKERS)",
    )
    return parser


def main() -> None:
    """CLI entry point: parse arguments and dispatch to the chosen command.

    ``sandbox`` and ``train`` are dispatched directly. ``test``, ``discover``
    and ``extract`` require a recipe file, which is loaded, checked for a
    string ``type`` field and validated before the command runs. Every
    validation failure is logged and ends the process with exit status 1.
    """
    setup_logging()
    load_local_env()
    from .telemetry import init_telemetry

    init_telemetry()

    available_sources = list_available_sources()

    args = _build_arg_parser().parse_args()

    args.detector_flush_batch_size = _int_setting(
        args.detector_flush_batch_size, "CLASSIFYRE_DETECTOR_FLUSH_BATCH_SIZE", 5
    )
    args.detector_max_concurrent = _int_setting(
        args.detector_max_concurrent, "CLASSIFYRE_DETECTOR_MAX_CONCURRENT", 10
    )
    args.processing_workers = _int_setting(
        args.processing_workers, "CLASSIFYRE_PROCESSING_WORKERS", 2
    )

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    if args.command == "sandbox":
        run_sandbox_command(args)
        return

    if args.command == "train":
        if not args.pipeline_schema or not args.examples or not args.output_dir:
            logger.error("train requires --pipeline-schema, --examples, and --output-dir")
            sys.exit(1)
        run_train_command(args)
        return

    if not args.recipe:
        logger.error("recipe argument is required for this command")
        sys.exit(1)

    recipe = load_recipe(args.recipe)

    # A null or non-string "type" is treated like a missing one instead of
    # crashing on .lower().
    raw_type = recipe.get("type")
    source_type = raw_type.lower() if isinstance(raw_type, str) else ""
    if not source_type:
        logger.error(
            "Recipe must have a 'type' field (e.g., 'WORDPRESS', 'SLACK', 'S3_COMPATIBLE_STORAGE')"
        )
        logger.info("Available source types: %s", ", ".join(available_sources))
        sys.exit(1)

    logger.info("Validating recipe for %s...", source_type)
    try:
        validate_input(recipe, source_type)
        logger.info("Recipe is valid")
    except Exception as e:
        logger.error("Recipe validation failed: %s", e)
        sys.exit(1)

    run_command(args, recipe)
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
# Run the CLI only when this module is executed as a script.
if __name__ == "__main__":
    # main() returns None, so this exits with status 0 unless main() exits itself.
    raise SystemExit(main())
|