cryptic-cti 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- classification/__init__.py +0 -0
- classification/utils.py +8 -0
- cryptic_cti-0.1.0.dist-info/METADATA +7 -0
- cryptic_cti-0.1.0.dist-info/RECORD +19 -0
- cryptic_cti-0.1.0.dist-info/WHEEL +5 -0
- cryptic_cti-0.1.0.dist-info/top_level.txt +8 -0
- extraction/__init__.py +0 -0
- extraction/base.py +6 -0
- extraction/engine.py +12 -0
- extraction/gliner_utils.py +40 -0
- extraction/spacy_utils.py +34 -0
- file_utils.py +34 -0
- models/gliner_model.py +11 -0
- normalization/__init__.py +0 -0
- normalization/utils.py +21 -0
- output/output_objects.py +106 -0
- preprocessing/__init__.py +0 -0
- preprocessing/chunking.py +98 -0
__init__.py
ADDED
|
File without changes
|
|
File without changes
|
classification/utils.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from setfit import SetFitModel
|
|
4
|
+
|
|
5
|
+
def load_model(model_dir: Path) -> SetFitModel:
    """Load a SetFit model from a local directory.

    Args:
        model_dir: Directory that holds the saved model.

    Returns:
        Whatever ``SetFitModel.from_pretrained`` returns for that directory.

    Raises:
        FileNotFoundError: If ``model_dir`` does not exist.
    """
    if model_dir.exists():
        # NOTE(review): ``fix_mistral_regex`` is passed through as a tokenizer
        # kwarg; confirm the installed tokenizer version accepts it.
        return SetFitModel.from_pretrained(
            str(model_dir),
            tokenizer_kwargs={"fix_mistral_regex": True},
        )
    raise FileNotFoundError(f"SetFit model directory not found in: {model_dir}")
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
file_utils.py,sha256=vDL4w0lNth2nlgjlGkGWGBg4O3KbR71TJIaHdqwmAiM,1240
|
|
3
|
+
classification/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
classification/utils.py,sha256=9eIK4DixydVfmxMlp2pR9mC-3x1Im5nDm9_97CBhofs,363
|
|
5
|
+
extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
extraction/base.py,sha256=MyS5T0QWT9Lf7qbhBXNbXrLfBMuEtXlkTjs1DEtJAvA,151
|
|
7
|
+
extraction/engine.py,sha256=7sdecf8YGkR7CLmmrgxsaloB-RrVlxev5cCkmYnFA_4,493
|
|
8
|
+
extraction/gliner_utils.py,sha256=TNbq-g2CbyZOT5rLeV1iJhPngnDFGbk6Ereulobj3cE,1361
|
|
9
|
+
extraction/spacy_utils.py,sha256=P_dWVKKMBVMW4gafjD3uWZ9m5zsmRTmKcew9SyHDtik,970
|
|
10
|
+
models/gliner_model.py,sha256=LGo59BkjYstgdE_znB8Ce3_vQmGnM_-2fmkMoYwLFHA,229
|
|
11
|
+
normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
normalization/utils.py,sha256=iOOkS2oBfHWzXczXlYRHh84tMxcO8gpQmc5mJU7OFGE,560
|
|
13
|
+
output/output_objects.py,sha256=wBLa7EeDX_qrJ47RL3rytPMrMPGYa00GysLWvtz7vKQ,4507
|
|
14
|
+
preprocessing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
preprocessing/chunking.py,sha256=IBK7zNwuAIV_6XCVuq7h3os0Ex8uGxNRNtZ3_hflP5I,2826
|
|
16
|
+
cryptic_cti-0.1.0.dist-info/METADATA,sha256=GkuyLavom3copHxYDXpg0WkeDwicrrxzIwCy8aG5mJA,244
|
|
17
|
+
cryptic_cti-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
18
|
+
cryptic_cti-0.1.0.dist-info/top_level.txt,sha256=RPUozcvsB6DIKdF7Q8gnsQqUrQTzmQGOQjnoee7hYlI,88
|
|
19
|
+
cryptic_cti-0.1.0.dist-info/RECORD,,
|
extraction/__init__.py
ADDED
|
File without changes
|
extraction/base.py
ADDED
extraction/engine.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from src.extraction.gliner_utils import GlinerRunner
|
|
2
|
+
from src.extraction.spacy_utils import SpacyRunner
|
|
3
|
+
|
|
4
|
+
class ExtractionEngine:
    """Run every registered extraction runner over a piece of text.

    Runners live in ``self.runners`` as a ``{name: runner}`` mapping. A runner
    only needs an ``extract(text)`` method.
    """

    def __init__(self):
        self.runners = {"spacy": SpacyRunner(), "gliner": GlinerRunner()}  # future: RegexRunner(), etc.

    def run(self, text: str) -> dict:
        """Apply each runner to *text*.

        Args:
            text: The text handed to every runner's ``extract`` method.

        Returns:
            A dict mapping each registered runner name (the key used in
            ``self.runners``) to that runner's ``extract(text)`` result.
        """
        # Iterate over .items(): looping over the dict itself yields only the
        # keys, and unpacking a key string into (name, runner) raises
        # ValueError. The registry key is used as the result key.
        return {name: runner.extract(text) for name, runner in self.runners.items()}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from src.models.gliner_model import get_gliner_model
|
|
3
|
+
from src.extraction.base import ExtractionRunner
|
|
4
|
+
from src.preprocessing.chunking import chunk_block_w_offsets, dedupe_entities
|
|
5
|
+
|
|
6
|
+
# GLiNER checkpoint identifier.
# NOTE(review): not referenced anywhere else in the visible code; presumably
# meant to match what get_gliner_model() loads - confirm or remove.
model_name = "urchade/gliner_medium-v2.1"

# Label strings passed to ``model.predict_entities`` for every chunk in
# extract_candidates(); GlinerRunner also stores them on the instance.
labels = [
    "malware or tool name",
    "credential theft activity",
    "credential or data type",
    "platform or application",
    "actor or group name",
]
|
|
15
|
+
|
|
16
|
+
def extract_candidates(text: str) -> list[dict]:
    """Run the GLiNER model over *text* chunk by chunk and collect entities.

    The text is split with ``chunk_block_w_offsets``; every prediction's
    ``start``/``end`` is shifted by its chunk's ``start`` before the combined
    list is passed through ``dedupe_entities``.

    Args:
        text: The text to extract entities from.

    Returns:
        A list of dicts with the keys ``text``, ``label``, ``score``,
        ``start`` and ``end``.
    """
    gliner = get_gliner_model()
    pieces = chunk_block_w_offsets(text)
    print(f"[extract_candidates] {len(pieces)} chunks | text len={len(text)}")

    collected = []
    for piece in pieces:
        collected.extend(
            {
                "text": hit["text"],
                "label": hit["label"],
                "score": float(hit["score"]),
                # Shift chunk-relative offsets by the chunk's own start.
                "start": piece["start"] + hit["start"],
                "end": piece["start"] + hit["end"],
            }
            for hit in gliner.predict_entities(piece["text"], labels)
        )
    return dedupe_entities(collected)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class GlinerRunner(ExtractionRunner):
    """Extraction runner that delegates to ``extract_candidates``."""

    def __init__(self):
        # Both values are kept on the instance, but extract() does not read
        # them: extract_candidates() fetches the model and labels itself.
        self.labels = labels
        self.model = get_gliner_model()

    def extract(self, text: str) -> list[dict]:
        """Return the entity candidates found in *text*."""
        candidates = extract_candidates(text)
        return candidates
|
|
40
|
+
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import spacy
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from collections_workflow.src.extraction.base import ExtractionRunner
|
|
6
|
+
|
|
7
|
+
# Shared spaCy pipeline used by spacy_prepare(); loaded when this module is
# imported, so importing the module requires "en_core_web_sm" to be installed.
nlp = spacy.load("en_core_web_sm")
|
|
8
|
+
|
|
9
|
+
def has_chinese(text: str) -> bool:
    """Return True if *text* has a character in the range U+4E00..U+9FFF."""
    found = re.search(r"[\u4e00-\u9fff]", text)
    return found is not None
|
|
11
|
+
|
|
12
|
+
def has_latin(text: str) -> bool:
    """Return True if *text* has at least one ASCII letter (A-Z or a-z)."""
    found = re.search(r"[A-Za-z]", text)
    return found is not None
|
|
14
|
+
|
|
15
|
+
def detect_lang(text: str) -> str:
    """Classify *text* by which scripts it contains.

    Args:
        text: The text to inspect.

    Returns:
        ``"zh"`` if only ``has_chinese`` matches, ``"en"`` if only
        ``has_latin`` matches, ``"mixed"`` if both match, and ``"unknown"``
        if neither matches (for example digits only, or an empty string).
    """
    zh = has_chinese(text)
    en = has_latin(text)
    if zh and en:
        return "mixed"
    if zh:
        return "zh"
    if en:
        return "en"
    # Neither script present. Previously this case fell into the "mixed"
    # branch and the "unknown" return was unreachable.
    return "unknown"
|
|
25
|
+
|
|
26
|
+
def spacy_prepare(text: str) -> dict:
    """Run the module-level spaCy pipeline on *text* and summarise the result.

    Args:
        text: The text to process.

    Returns:
        A dict with ``lang`` (from ``detect_lang``), ``sentence_count``,
        ``token_count`` (``len`` of the spaCy doc) and ``sentences`` (the
        stripped text of every sentence, in order).
    """
    doc = nlp(text)
    sentences = list(doc.sents)
    return {
        "lang": detect_lang(text),
        "sentence_count": len(sentences),
        "token_count": len(doc),
        # The former ``if sentences`` filter tested the whole list and was
        # always true while iterating, so it has been dropped. Every sentence
        # is kept, which keeps this list in step with ``sentence_count``.
        "sentences": [sent.text.strip() for sent in sentences],
    }
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SpacyRunner(ExtractionRunner):
    """Extraction runner that delegates to ``spacy_prepare``."""

    def extract(self, text: str) -> dict:
        """Return the ``spacy_prepare`` summary dict for *text*."""
        summary = spacy_prepare(text)
        return summary
|
file_utils.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
def write_jsonl(path: Path, records: list[dict]) -> None:
    """Write *records* to *path* as JSON Lines, one object per line.

    Missing parent directories are created, and an existing file is
    overwritten. Non-ASCII characters are written as-is in UTF-8.

    Args:
        path: Destination file.
        records: Objects to serialise with ``json.dumps``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in records
        )
|
|
10
|
+
|
|
11
|
+
def read_jsonl(path: Path) -> list[dict]:
    """Read a JSON Lines file and return the parsed objects in file order.

    Blank (whitespace-only) lines are skipped, but still count towards the
    line number reported in errors.

    Args:
        path: UTF-8 encoded JSONL file.

    Returns:
        One parsed object per non-blank line.

    Raises:
        ValueError: If a line is not valid JSON; the message names the file
            and the 1-based line number.
    """
    parsed: list[dict] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, start=1):
            payload = raw.strip()
            if payload:
                try:
                    parsed.append(json.loads(payload))
                except json.JSONDecodeError as e:
                    raise ValueError(f"Invalid JSONL in {path} at line {line_no}: {e}") from e
    return parsed
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def latest_matching_file(directory: Path, pattern: str) -> Path:
    """Return the most recently modified regular file matching *pattern*.

    Args:
        directory: Directory to search with ``Path.glob``.
        pattern: Glob pattern, relative to *directory*.

    Returns:
        The matching file with the largest ``st_mtime``.

    Raises:
        FileNotFoundError: If no regular file matches.
    """
    candidates = [entry for entry in directory.glob(pattern) if entry.is_file()]
    if candidates:
        return max(candidates, key=lambda entry: entry.stat().st_mtime)
    raise FileNotFoundError(f"No files found matching {pattern} in {directory}")
|
|
30
|
+
|
|
31
|
+
def load_json(path: Path) -> dict:
    """Parse the UTF-8 JSON document stored at *path* and return it."""
    document = path.read_text(encoding="utf-8")
    return json.loads(document)
|
|
34
|
+
|
models/gliner_model.py
ADDED
|
File without changes
|
normalization/utils.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
def normalize_key(value: str) -> str:
    """Return *value* lower-cased, trimmed, with whitespace runs collapsed to one space."""
    lowered = value.strip().lower()
    words = lowered.split()
    return " ".join(words)
|
|
5
|
+
|
|
6
|
+
def normalize_value(value: str, mapping: dict[str, str]) -> tuple[str, bool]:
    """Map *value* to its canonical form when *mapping* knows it.

    Args:
        value: Raw value; surrounding whitespace is ignored.
        mapping: Canonical values keyed by ``normalize_key`` output.

    Returns:
        ``(mapping[key], True)`` when the normalized key is present in
        *mapping*, otherwise ``(stripped value, False)``.
    """
    trimmed = value.strip()
    lookup = normalize_key(trimmed)
    if lookup not in mapping:
        return trimmed, False
    return mapping[lookup], True
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def dedupe_preserve_order(values: list) -> list:
    """Return *values* without duplicates, keeping each first occurrence in order.

    Items must be hashable.
    """
    # dict keys are unique and remember insertion order, so this keeps the
    # first occurrence of every item.
    return list(dict.fromkeys(values))
|
output/output_objects.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import asdict, dataclass, field
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from typing import Any
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
def norm_fieldname(field_name: str) -> str:
    """Return *field_name* trimmed of surrounding whitespace and lower-cased."""
    trimmed = field_name.strip()
    return trimmed.lower()
|
|
10
|
+
|
|
11
|
+
def utc_now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()
|
|
13
|
+
|
|
14
|
+
@dataclass(slots=True)
class Relationship:
    """A typed link to another object, identified by ``target_id``."""

    # Kind of relationship; a free-form string, not validated here.
    type: str
    # Identifier of the related object (presumably another Output's ``id`` -
    # confirm against callers).
    target_id: str
    # Optional free-text note; defaults to the empty string.
    description: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, built with ``dataclasses.asdict``."""
        return asdict(self)
|
|
21
|
+
|
|
22
|
+
@dataclass(slots=True)
class Output:
    """Output record with validated confidence/TLP and de-duplicated lists.

    Construction validates ``confidence`` and ``tlp`` and removes duplicate
    entries from ``source_ids`` and ``tags``. Scalar fields are changed with
    ``set_field``; the list fields are extended with ``add_to``.
    """

    # Random UUID4 string unless the caller supplies one.
    id: str = field(default_factory=lambda: str(uuid4()))
    type: str = ""
    # Creation timestamp produced by utc_now_iso() unless supplied.
    generated_at: str = field(default_factory=utc_now_iso)
    producer: str = ""
    source_ids: list[str] = field(default_factory=list)
    # Either None or a value in the inclusive range 0..100.
    confidence: int | None = None
    # Must be one of ``allowed_tlp``.
    tlp: str = "TLP:CLEAR"
    tags: list[str] = field(default_factory=list)
    relationships: list[Relationship] = field(default_factory=list)
    summary: str = ""
    # Unannotated, so this is a shared class attribute and not a dataclass field.
    allowed_tlp = {"TLP:CLEAR", "TLP:GREEN", "TLP:AMBER", "TLP:RED"}
    payload: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Validate ``confidence`` and ``tlp``; de-duplicate ``source_ids`` and ``tags``.

        Raises:
            ValueError: If ``confidence`` is outside 0..100, if ``tlp`` is not
                an allowed marking, or if ``source_ids``/``tags`` is not a list.
        """
        score = self.confidence
        if score is not None and not (0 <= score <= 100):
            raise ValueError("confidence must be between 0 and 100")
        for list_field in ("source_ids", "tags"):
            setattr(self, list_field, self.dedupe(list_field))
        if self.tlp not in self.allowed_tlp:
            raise ValueError(
                f"tlp must be one of {sorted(self.allowed_tlp)}, got {self.tlp!r}")

    def dedupe(self, field_name: str) -> list[Any]:
        """Return the named list field's items without duplicates, order kept.

        Uses equality comparison rather than hashing, so unhashable items
        work. The field itself is not modified.

        Raises:
            ValueError: If the field does not exist or does not hold a list.
        """
        key = norm_fieldname(field_name)
        if key not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {key}")
        items = getattr(self, key)
        if not isinstance(items, list):
            raise ValueError(f"Field {key} is not a list, cannot dedupe")
        unique = []
        for item in items:
            if item in unique:
                continue
            unique.append(item)
        return unique

    def set_field(self, field_name: str, value: Any) -> None:
        """Assign *value* to a non-list field after validating it.

        Raises:
            ValueError: Unknown field, ``None`` value, ``confidence`` outside
                0..100, or ``tlp`` not in ``allowed_tlp``.
            TypeError: If the field is ``source_ids``, ``tags`` or
                ``relationships`` (those are changed through ``add_to``).
        """
        key = norm_fieldname(field_name)
        if key not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {key}")
        if value is None:
            raise ValueError(f"Value for {key} cannot be empty")
        if key in {"source_ids", "tags", "relationships"}:
            raise TypeError(f"use .add_to() to add values to {key}")
        if key == "confidence" and not (0 <= value <= 100):
            raise ValueError("confidence must be between 0 and 100")
        if key == "tlp" and value not in self.allowed_tlp:
            raise ValueError(f"tlp must be one of {sorted(self.allowed_tlp)}, got {value!r}")
        setattr(self, key, value)

    def add_to(self, field_name: str, value: Any) -> None:
        """Append *value* (or extend with it, if it is a list) to a list field.

        The field's list is updated in place and de-duplicated afterwards.

        Raises:
            ValueError: Unknown field, or the field does not hold a list.
            TypeError: If the field is ``relationships`` and *value* is not a
                ``Relationship`` or a list made up only of ``Relationship``.
        """
        key = norm_fieldname(field_name)
        if key not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {key}")
        target = getattr(self, key)
        if not isinstance(target, list):
            raise ValueError(f"Field {key} is not a list, use .set_field() instead")

        many = isinstance(value, list)
        if key == "relationships":
            if many:
                if any(not isinstance(item, Relationship) for item in value):
                    raise TypeError(f"All items in {key} must be of type Relationship")
            elif not isinstance(value, Relationship):
                raise TypeError(f"relationships must be of type Relationship, got {type(value)}")

        if many:
            target.extend(value)
        else:
            target.append(value)
        # Slice assignment keeps the same list object bound to the field.
        target[:] = self.dedupe(key)

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a dict; relationships become plain dicts.

        The ``source_ids``, ``tags`` and ``payload`` values are the instance's
        own objects, not copies.
        """
        serialized = {
            name: getattr(self, name)
            for name in (
                "id",
                "type",
                "generated_at",
                "producer",
                "source_ids",
                "confidence",
                "tlp",
                "tags",
            )
        }
        serialized["relationships"] = [rel.to_dict() for rel in self.relationships]
        serialized["summary"] = self.summary
        serialized["payload"] = self.payload
        return serialized
|
|
File without changes
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Progress marker printed when this module is imported, just before the spaCy import.
print("starting spacy import...")
|
|
2
|
+
import spacy
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# Cached spaCy pipeline; stays None until get_nlp() is first called.
_nlp = None


def get_nlp():
    """Return the shared ``en_core_web_sm`` pipeline, loading it on first use."""
    global _nlp
    if _nlp is not None:
        return _nlp
    _nlp = spacy.load("en_core_web_sm")
    return _nlp
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def sentence_chunks(text: str, target_chars: int = 900, overlap_sentences: int = 1) -> list[str]:
    """Group the sentences of *text* into overlapping chunks.

    Sentences come from the spaCy pipeline returned by ``get_nlp``. Each chunk
    is a run of consecutive stripped sentences joined by single spaces. A
    chunk always holds at least one sentence and stops growing once the next
    sentence would push it past *target_chars*.

    Args:
        text: The text to split.
        target_chars: Soft upper bound on chunk length; a single sentence
            longer than this still forms its own chunk.
        overlap_sentences: How many trailing sentences of one chunk are
            repeated at the start of the next. Every chunk still starts at
            least one sentence after the previous one.

    Returns:
        The chunk strings in order. If spaCy yields no non-empty sentence,
        the stripped text as a single chunk, or ``[]`` if that is empty.
    """
    doc = get_nlp()(text)
    sentences = [clean for clean in (sent.text.strip() for sent in doc.sents) if clean]
    if not sentences:
        fallback = text.strip()
        return [fallback] if fallback else []

    total = len(sentences)
    chunks: list[str] = []
    cursor = 0
    while cursor < total:
        window_start = cursor
        window: list[str] = []
        window_len = 0
        for idx in range(window_start, total):
            sentence = sentences[idx]
            # +1 accounts for the joining space once the window is non-empty.
            grown = window_len + len(sentence) + (1 if window else 0)
            if window and grown > target_chars:
                break
            window.append(sentence)
            window_len = grown
        cursor = window_start + len(window)

        joined = " ".join(window).strip()
        if joined:
            chunks.append(joined)
        if cursor >= total:
            break
        # Step back for the overlap, but always advance by at least one sentence.
        cursor = max(window_start + 1, cursor - overlap_sentences)
    return chunks
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def char_chunks(text: str, target_chars: int = 900, overlap_chars: int = 150) -> list[str]:
    """Split *text* into fixed-width, overlapping character windows.

    Args:
        text: The text to split; surrounding whitespace is removed first.
        target_chars: Width of each window before stripping.
        overlap_chars: Number of characters each window shares with the
            previous one. Every window still starts at least one character
            after the previous one.

    Returns:
        The stripped, non-empty windows in order; ``[]`` for blank text.
    """
    body = text.strip()
    total = len(body)
    pieces = []
    left = 0
    while left < total:
        right = min(left + target_chars, total)
        window = body[left:right].strip()
        if window:
            pieces.append(window)
        if right >= total:
            break
        left = max(right - overlap_chars, left + 1)
    return pieces
|
|
60
|
+
|
|
61
|
+
def chunk_block(block: str) -> list[str]:
    """Split a text block into chunks, choosing a strategy by its shape.

    * Blank input gives ``[]``.
    * A single-line block of at most 1200 characters is returned whole.
    * A block with more than two ``.`` characters is split with
      ``sentence_chunks``; if that raises, the error is printed and the
      character-based split is used instead.
    * Anything else is split with ``char_chunks``.
    """
    body = block.strip()
    if not body:
        return []

    single_line = "\n" not in body
    if single_line and len(body) <= 1200:
        return [body]

    if body.count(".") > 2:
        try:
            return sentence_chunks(body, target_chars=700, overlap_sentences=2)
        except Exception as e:
            # Best effort: report the failure and fall through to char_chunks.
            print(f"Error during sentence chunking: {e}")
    return char_chunks(body, target_chars=700, overlap_chars=200)
|
|
74
|
+
|
|
75
|
+
def chunk_block_w_offsets(block: str):
    """Chunk *block* with ``chunk_block`` and locate every chunk inside it.

    Consecutive chunks overlap, so each search starts just after the previous
    chunk's start and not at its end. Searching from the end made
    ``str.find`` miss every overlapping chunk and report ``start == -1``.

    Args:
        block: The text to chunk.

    Returns:
        A list of dicts with ``text`` (the chunk), ``start`` (index of the
        chunk in *block*, or -1 if it could not be located) and ``end``
        (``start + len(text)``).
    """
    located = []
    cursor = 0
    for chunk in chunk_block(block):
        start = _locate_chunk(block, chunk, cursor)
        located.append({
            "text": chunk,
            "start": start,
            "end": start + len(chunk),
        })
        if start >= 0:
            # The next chunk begins after this one's start but may overlap it.
            cursor = start + 1
    return located


def _locate_chunk(block: str, chunk: str, cursor: int) -> int:
    """Return the index of *chunk* in *block*, preferring matches at or after *cursor*.

    An exact match is tried first. Sentence chunks are re-joined with single
    spaces and may not be exact substrings, so the fallback matches the
    chunk's tokens with any amount of whitespace between them. Returns -1
    when nothing matches.
    """
    import re  # local import: re is not among this module's visible imports

    exact = block.find(chunk, cursor)
    if exact != -1:
        return exact
    tokens = chunk.split()
    if not tokens:
        return -1
    loose = re.compile(r"\s*".join(re.escape(token) for token in tokens))
    # Prefer a match from the cursor; otherwise accept one anywhere in block.
    hit = loose.search(block, cursor) or loose.search(block)
    return hit.start() if hit is not None else -1
|
|
89
|
+
|
|
90
|
+
def dedupe_entities(entities):
    """Drop repeated entities, keeping the first of each in input order.

    Two entities are duplicates when their ``text``, ``start``, ``end`` and
    ``label`` values are all equal; other keys (such as ``score``) are ignored.
    """
    unique = {}
    for entity in entities:
        fingerprint = (entity["text"], entity["start"], entity["end"], entity["label"])
        # setdefault keeps the entity already stored for this fingerprint.
        unique.setdefault(fingerprint, entity)
    return list(unique.values())
|