cryptic-cti 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cryptic_cti-0.1.0/PKG-INFO +7 -0
- cryptic_cti-0.1.0/README.md +0 -0
- cryptic_cti-0.1.0/pyproject.toml +13 -0
- cryptic_cti-0.1.0/setup.cfg +4 -0
- cryptic_cti-0.1.0/src/__init__.py +0 -0
- cryptic_cti-0.1.0/src/classification/__init__.py +0 -0
- cryptic_cti-0.1.0/src/classification/utils.py +8 -0
- cryptic_cti-0.1.0/src/cryptic_cti.egg-info/PKG-INFO +7 -0
- cryptic_cti-0.1.0/src/cryptic_cti.egg-info/SOURCES.txt +25 -0
- cryptic_cti-0.1.0/src/cryptic_cti.egg-info/dependency_links.txt +1 -0
- cryptic_cti-0.1.0/src/cryptic_cti.egg-info/top_level.txt +8 -0
- cryptic_cti-0.1.0/src/extraction/__init__.py +0 -0
- cryptic_cti-0.1.0/src/extraction/base.py +6 -0
- cryptic_cti-0.1.0/src/extraction/engine.py +12 -0
- cryptic_cti-0.1.0/src/extraction/gliner_utils.py +40 -0
- cryptic_cti-0.1.0/src/extraction/spacy_utils.py +34 -0
- cryptic_cti-0.1.0/src/file_utils.py +34 -0
- cryptic_cti-0.1.0/src/models/gliner_model.py +11 -0
- cryptic_cti-0.1.0/src/normalization/__init__.py +0 -0
- cryptic_cti-0.1.0/src/normalization/utils.py +21 -0
- cryptic_cti-0.1.0/src/output/output_objects.py +106 -0
- cryptic_cti-0.1.0/src/preprocessing/__init__.py +0 -0
- cryptic_cti-0.1.0/src/preprocessing/chunking.py +98 -0
- cryptic_cti-0.1.0/tests/test_ctier_parser.py +33 -0
- cryptic_cti-0.1.0/tests/test_parser_utils.py +46 -0
- cryptic_cti-0.1.0/tests/test_semantex_smoke.py +11 -0
- cryptic_cti-0.1.0/tests/test_spacy_utils.py +23 -0
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Build backend configuration (PEP 518).
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# Core package metadata (PEP 621).
# NOTE(review): no [project] dependencies are declared although the source
# imports spacy, setfit, and a GLiNER model wrapper — confirm intended.
# NOTE(review): the code uses @dataclass(slots=True) and `int | None` unions
# (Python 3.10+), so requires-python = ">=3.9" looks too low — confirm.
[project]
name = "cryptic-cti"
version = "0.1.0"
description = "Multilingual CTI collections pipeline for normalizing and structuring cybercrime leads"
readme = "README.md"
requires-python = ">=3.9"
authors = [
    { name="Cosmic Octopus" }
]
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from setfit import SetFitModel
|
|
4
|
+
|
|
5
|
+
def load_model(model_dir: Path) -> SetFitModel:
    """Load a fine-tuned SetFit model from a local directory.

    Args:
        model_dir: Directory containing the saved SetFit model files.

    Returns:
        The loaded SetFitModel.

    Raises:
        FileNotFoundError: If *model_dir* does not exist on disk.
    """
    if model_dir.exists():
        return SetFitModel.from_pretrained(str(model_dir), tokenizer_kwargs={"fix_mistral_regex": True})
    raise FileNotFoundError(f"SetFit model directory not found in: {model_dir}")
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/__init__.py
|
|
4
|
+
src/file_utils.py
|
|
5
|
+
src/classification/__init__.py
|
|
6
|
+
src/classification/utils.py
|
|
7
|
+
src/cryptic_cti.egg-info/PKG-INFO
|
|
8
|
+
src/cryptic_cti.egg-info/SOURCES.txt
|
|
9
|
+
src/cryptic_cti.egg-info/dependency_links.txt
|
|
10
|
+
src/cryptic_cti.egg-info/top_level.txt
|
|
11
|
+
src/extraction/__init__.py
|
|
12
|
+
src/extraction/base.py
|
|
13
|
+
src/extraction/engine.py
|
|
14
|
+
src/extraction/gliner_utils.py
|
|
15
|
+
src/extraction/spacy_utils.py
|
|
16
|
+
src/models/gliner_model.py
|
|
17
|
+
src/normalization/__init__.py
|
|
18
|
+
src/normalization/utils.py
|
|
19
|
+
src/output/output_objects.py
|
|
20
|
+
src/preprocessing/__init__.py
|
|
21
|
+
src/preprocessing/chunking.py
|
|
22
|
+
tests/test_ctier_parser.py
|
|
23
|
+
tests/test_parser_utils.py
|
|
24
|
+
tests/test_semantex_smoke.py
|
|
25
|
+
tests/test_spacy_utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from src.extraction.gliner_utils import GlinerRunner
|
|
2
|
+
from src.extraction.spacy_utils import SpacyRunner
|
|
3
|
+
|
|
4
|
+
class ExtractionEngine:
    """Fan a text out to every registered extraction runner."""

    def __init__(self):
        # Registry of runners keyed by a short name; extend here for future
        # runners (e.g. a RegexRunner).
        self.runners = {"spacy": SpacyRunner(), "gliner": GlinerRunner()}

    def run(self, text: str) -> dict:
        """Run every registered runner on *text*.

        Returns:
            A dict mapping each registry name to that runner's extraction
            result.
        """
        # Bug fix: the original iterated `self.runners` directly, which yields
        # only the string keys — tuple-unpacking a key raised ValueError on
        # first use. Iterate .items() and keep the registry key as the name.
        return {name: runner.extract(text) for name, runner in self.runners.items()}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from src.models.gliner_model import get_gliner_model
|
|
3
|
+
from src.extraction.base import ExtractionRunner
|
|
4
|
+
from src.preprocessing.chunking import chunk_block_w_offsets, dedupe_entities
|
|
5
|
+
|
|
6
|
+
# Hugging Face model id for the GLiNER extractor.
# NOTE(review): model_name appears unused in this module — presumably
# consumed by src.models.gliner_model.get_gliner_model(); confirm.
model_name = "urchade/gliner_medium-v2.1"

# Natural-language entity labels that GLiNER is prompted with.
labels = [
    "malware or tool name",
    "credential theft activity",
    "credential or data type",
    "platform or application",
    "actor or group name",
]
|
|
15
|
+
|
|
16
|
+
def extract_candidates(text: str) -> list[dict]:
    """Run GLiNER over *text* chunk-by-chunk and collect candidate entities.

    Chunk-local spans are shifted back to offsets in the original text, and
    exact duplicates are removed before returning.

    Returns:
        A list of {"text", "label", "score", "start", "end"} dicts.
    """
    model = get_gliner_model()
    chunks = chunk_block_w_offsets(text)
    print(f"[extract_candidates] {len(chunks)} chunks | text len={len(text)}")
    candidates: list[dict] = []
    for piece in chunks:
        base = piece["start"]
        for hit in model.predict_entities(piece["text"], labels):
            candidates.append({
                "text": hit["text"],
                "label": hit["label"],
                "score": float(hit["score"]),
                # Shift chunk-local offsets into whole-text coordinates.
                "start": base + hit["start"],
                "end": base + hit["end"],
            })
    return dedupe_entities(candidates)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class GlinerRunner(ExtractionRunner):
    """ExtractionRunner adapter around the module-level GLiNER pipeline."""

    def __init__(self):
        # NOTE(review): self.model and self.labels are stored but extract()
        # delegates to extract_candidates(), which calls get_gliner_model()
        # and uses the module-level labels itself — presumably the loader is
        # cached; confirm to avoid a double model load.
        self.model = get_gliner_model()
        self.labels = labels

    def extract(self, text: str) -> list[dict]:
        # Full chunk → predict → dedupe pipeline; see extract_candidates().
        return extract_candidates(text)
|
|
40
|
+
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import spacy
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
# NOTE(review): this import uses the `collections_workflow.` package root
# while sibling modules (engine.py, gliner_utils.py) import from
# `src.extraction.base` directly — confirm which root matches the installed
# layout; one of the two spellings will fail at import time.
from collections_workflow.src.extraction.base import ExtractionRunner

# Pipeline is loaded eagerly at import time; chunking.py lazy-loads via
# get_nlp() instead. NOTE(review): consider the same lazy pattern here to
# avoid the import-time cost.
nlp = spacy.load("en_core_web_sm")
|
|
8
|
+
|
|
9
|
+
def has_chinese(text: str) -> bool:
    """Return True if *text* contains at least one CJK Unified Ideograph."""
    return re.search(r"[\u4e00-\u9fff]", text) is not None
|
|
11
|
+
|
|
12
|
+
def has_latin(text: str) -> bool:
    """Return True if *text* contains at least one ASCII letter."""
    return re.search(r"[A-Za-z]", text) is not None
|
|
14
|
+
|
|
15
|
+
def detect_lang(text: str) -> str:
    """Classify *text* by script content.

    Returns:
        "zh" for Chinese-only, "en" for Latin-only, "mixed" when both
        scripts are present, and "unknown" when neither is (digits,
        punctuation, other scripts).
    """
    zh = has_chinese(text)
    en = has_latin(text)
    # Bug fix: the original's trailing `return "unknown"` sat after an
    # exhaustive if/elif/else and was unreachable, so script-free text was
    # misreported as "mixed". Make "unknown" the neither-script result.
    if zh and en:
        return "mixed"
    if zh:
        return "zh"
    if en:
        return "en"
    return "unknown"
|
|
25
|
+
|
|
26
|
+
def spacy_prepare(text: str) -> dict:
    """Run the spaCy pipeline over *text* and return basic structure stats.

    Returns:
        A dict with the detected language ("zh"/"en"/"mixed"/...), sentence
        and token counts, and the stripped sentence texts in order.
    """
    doc = nlp(text)
    sentences = list(doc.sents)
    # Bug fix: the original comprehension filtered on `if sentences` — the
    # whole list, a constant per iteration — which was a misleading no-op
    # (an empty list yields nothing from the comprehension anyway).
    return {
        "lang": detect_lang(text),
        "sentence_count": len(sentences),
        "token_count": len(doc),
        "sentences": [sent.text.strip() for sent in sentences],
    }
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SpacyRunner(ExtractionRunner):
    """ExtractionRunner adapter returning spaCy-derived text structure."""

    def extract(self, text: str) -> dict:
        # Delegates to the module-level pipeline helper.
        return spacy_prepare(text)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
def write_jsonl(path: Path, records: list[dict]) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines, creating parent dirs.

    Non-ASCII characters are written verbatim (ensure_ascii=False).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = (json.dumps(record, ensure_ascii=False) for record in records)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(line + "\n" for line in serialized)
|
|
10
|
+
|
|
11
|
+
def read_jsonl(path: Path) -> list[dict]:
    """Read a UTF-8 JSON Lines file into a list of dicts.

    Blank lines are skipped. Raises ValueError (with a 1-based line number)
    when a line is not valid JSON.
    """
    records: list[dict] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, start=1):
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                records.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSONL in {path} at line {line_no}: {e}") from e
    return records
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def latest_matching_file(directory: Path, pattern: str) -> Path:
    """Return the most recently modified file in *directory* matching *pattern*.

    Raises:
        FileNotFoundError: When no file matches the glob pattern.
    """
    candidates = [entry for entry in directory.glob(pattern) if entry.is_file()]
    if not candidates:
        raise FileNotFoundError(f"No files found matching {pattern} in {directory}")
    # Newest wins, decided by filesystem modification time.
    return max(candidates, key=lambda entry: entry.stat().st_mtime)
|
|
30
|
+
|
|
31
|
+
def load_json(path: Path) -> dict:
    """Parse a UTF-8 JSON file and return the decoded object."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
|
|
34
|
+
|
|
File without changes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
def normalize_key(value: str) -> str:
    """Lower-case *value* and collapse all runs of whitespace to one space."""
    # split() with no args already ignores leading/trailing whitespace.
    tokens = value.lower().split()
    return " ".join(tokens)
|
|
5
|
+
|
|
6
|
+
def normalize_value(value: str, mapping: dict[str, str]) -> tuple[str, bool]:
    """Canonicalize *value* through *mapping* (keyed by normalized form).

    Returns:
        (canonical_value, True) on a mapping hit, otherwise
        (stripped_value, False).
    """
    cleaned = value.strip()
    lookup = normalize_key(cleaned)
    try:
        return mapping[lookup], True
    except KeyError:
        return cleaned, False
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def dedupe_preserve_order(values: list) -> list:
    """Return *values* without duplicates, keeping first occurrences in order."""
    witnessed = set()
    result = []
    for item in values:
        if item in witnessed:
            continue
        witnessed.add(item)
        result.append(item)
    return result
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import asdict, dataclass, field
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from typing import Any
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
def norm_fieldname(field_name: str) -> str:
    """Normalize a field name for lookup: trimmed and lower-cased."""
    trimmed = field_name.strip()
    return trimmed.lower()
|
|
10
|
+
|
|
11
|
+
def utc_now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()
|
|
13
|
+
|
|
14
|
+
@dataclass(slots=True)
|
|
15
|
+
class Relationship:
|
|
16
|
+
type: str
|
|
17
|
+
target_id: str
|
|
18
|
+
description: str = ""
|
|
19
|
+
def to_dict(self) -> dict[str, Any]:
|
|
20
|
+
return asdict(self)
|
|
21
|
+
|
|
22
|
+
@dataclass(slots=True)
class Output:
    """Envelope for one pipeline output record.

    Carries provenance (producer, source_ids), handling metadata (tlp,
    confidence, tags), typed links to other records (relationships), and the
    producer-specific payload dict. Construction validates confidence and tlp
    and dedupes the list fields.
    """

    # Unique record id, generated per instance.
    id: str = field(default_factory=lambda: str(uuid4()))
    type: str = ""
    # ISO-8601 UTC creation timestamp.
    generated_at: str = field(default_factory=utc_now_iso)
    producer: str = ""
    source_ids: list[str] = field(default_factory=list)
    # Percentage 0-100, or None when unscored (validated in __post_init__).
    confidence: int | None = None
    tlp: str = "TLP:CLEAR"
    tags: list[str] = field(default_factory=list)
    relationships: list[Relationship] = field(default_factory=list)
    summary: str = ""
    # Unannotated, so dataclasses treats this as a shared class attribute,
    # not a field. NOTE(review): annotate as ClassVar[set[str]] (and move it
    # out of the field list) to make that explicit.
    allowed_tlp = {"TLP:CLEAR", "TLP:GREEN", "TLP:AMBER", "TLP:RED"}
    payload: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Validate confidence/tlp and dedupe list fields at construction time.
        if self.confidence is not None and not (0 <= self.confidence <= 100):
            raise ValueError("confidence must be between 0 and 100")
        self.source_ids = self.dedupe("source_ids")
        self.tags = self.dedupe("tags")
        if self.tlp not in self.allowed_tlp:
            raise ValueError(
                f"tlp must be one of {sorted(self.allowed_tlp)}, got {self.tlp!r}")

    def dedupe(self, field_name: str) -> list[Any]:
        """Return *field_name*'s list with duplicates removed, order preserved.

        Raises ValueError for unknown field names or non-list fields. Uses a
        list membership scan (O(n^2)) rather than a set so that unhashable
        items — e.g. Relationship, a non-frozen dataclass — still work.
        """
        field_name = norm_fieldname(field_name)
        if field_name not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {field_name}")
        values = getattr(self, field_name)
        if not isinstance(values, list):
            raise ValueError(f"Field {field_name} is not a list, cannot dedupe")
        out = []
        for value in values:
            if value not in out:
                out.append(value)
        return out

    def set_field(self, field_name: str, value: Any) -> None:
        """Validated setter for scalar fields.

        Rejects unknown field names, None values, the list fields (use
        add_to() for those), out-of-range confidence, and unknown TLP labels.
        """
        field_name = norm_fieldname(field_name)
        if field_name not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {field_name}")
        elif value is None:
            raise ValueError(f"Value for {field_name} cannot be empty")
        elif field_name in {"source_ids", "tags", "relationships"}:
            raise TypeError(f"use .add_to() to add values to {field_name}")
        elif field_name == "confidence" and not (0 <= value <= 100):
            raise ValueError("confidence must be between 0 and 100")
        elif field_name == "tlp" and value not in self.allowed_tlp:
            raise ValueError(f"tlp must be one of {sorted(self.allowed_tlp)}, got {value!r}")
        else:
            setattr(self, field_name, value)

    def add_to(self, field_name: str, value: Any) -> None:
        """Append a value (or list of values) to a list field, then dedupe.

        For "relationships" every added item must be a Relationship instance.
        Raises ValueError for unknown/non-list fields and TypeError for
        wrongly-typed relationship values.
        """
        field_name = norm_fieldname(field_name)
        if field_name not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {field_name}")
        current_value = getattr(self, field_name)
        if not isinstance(current_value, list):
            raise ValueError(f"Field {field_name} is not a list, use .set_field() instead")
        elif field_name == "relationships":
            # Type-check before mutating so a bad batch leaves state untouched.
            if isinstance(value, list):
                if not all(isinstance(item, Relationship) for item in value):
                    raise TypeError(f"All items in {field_name} must be of type Relationship")
            elif not isinstance(value, Relationship):
                raise TypeError(f"relationships must be of type Relationship, got {type(value)}")
        if isinstance(value, list):
            current_value.extend(value)
        else:
            current_value.append(value)
        # In-place slice assignment keeps external references to the list valid.
        current_value[:] = self.dedupe(field_name)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict; relationships are converted recursively.

        NOTE(review): list/dict fields are returned by reference, not copied —
        mutating the result mutates this instance.
        """
        return {
            "id": self.id,
            "type": self.type,
            "generated_at": self.generated_at,
            "producer": self.producer,
            "source_ids": self.source_ids,
            "confidence": self.confidence,
            "tlp": self.tlp,
            "tags": self.tags,
            "relationships": [rel.to_dict() for rel in self.relationships],
            "summary": self.summary,
            "payload": self.payload,
        }
|
|
File without changes
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Debug aid: the spacy import can be slow, so announce it at module load.
# NOTE(review): consider removing this or switching to logging for library use.
print(f"starting spacy import...")
import spacy


# Lazily-initialized shared spaCy pipeline; populated on first get_nlp() call.
_nlp = None
|
|
6
|
+
def get_nlp():
    """Return the module-wide spaCy pipeline, loading it on first use."""
    global _nlp
    if _nlp is not None:
        return _nlp
    _nlp = spacy.load("en_core_web_sm")
    return _nlp
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def sentence_chunks(text: str, target_chars: int = 900, overlap_sentences: int = 1) -> list[str]:
    """Split *text* into chunks of whole sentences of roughly *target_chars*.

    Consecutive chunks overlap by *overlap_sentences* sentences so context
    carries across chunk boundaries. When spaCy finds no sentences, the
    stripped text is returned as a single chunk (or [] if empty).
    """
    nlp = get_nlp()
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if not sentences:
        stripped = text.strip()
        return [stripped] if stripped else []
    chunks: list[str] = []
    current: list[str] = []
    i = 0
    while i < len(sentences):
        # Start a fresh chunk at sentence i.
        current = []
        current_len = 0
        start_i = i
        while i < len(sentences):
            sentence = sentences[i]
            # +1 accounts for the joining space inserted before this sentence.
            projected = current_len + len(sentence) + (1 if current else 0)
            # A chunk always takes at least one sentence, even if oversized.
            if current and projected > target_chars:
                break
            current.append(sentence)
            current_len = projected
            i += 1
        chunk = " ".join(current).strip()
        if chunk:
            chunks.append(chunk)
        if i >= len(sentences):
            break
        # Rewind for overlap, but always advance past the previous chunk's
        # first sentence so the outer loop is guaranteed to terminate.
        i = max(start_i + 1, i - overlap_sentences)
    return chunks
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def char_chunks(text: str, target_chars: int = 900, overlap_chars: int = 150) -> list[str]:
    """Split *text* into fixed-size character windows with trailing overlap.

    Each window is at most *target_chars* long; successive windows re-cover
    the last *overlap_chars* characters of the previous one.
    """
    text = text.strip()
    if not text:
        return []
    length = len(text)
    windows: list[str] = []
    cursor = 0
    while cursor < length:
        stop = min(cursor + target_chars, length)
        piece = text[cursor:stop].strip()
        if piece:
            windows.append(piece)
        if stop >= length:
            break
        # Step back for overlap, but always advance at least one character.
        cursor = max(stop - overlap_chars, cursor + 1)
    return windows
|
|
60
|
+
|
|
61
|
+
def chunk_block(block: str) -> list[str]:
    """Chunk one text block, preferring sentence-aware splitting.

    Short single-line blocks pass through untouched; blocks with enough
    periods are split by sentence, falling back to character windows on any
    failure or when the sentence heuristic does not apply.
    """
    block = block.strip()
    if not block:
        return []
    if len(block) <= 1200 and "\n" not in block:
        return [block]
    # Heuristic: more than two periods suggests real sentences to split on.
    if block.count(".") > 2:
        try:
            return sentence_chunks(block, target_chars=700, overlap_sentences=2)
        except Exception as e:
            # Best-effort: report and fall through to character chunking.
            print(f"Error during sentence chunking: {e}")
    return char_chunks(block, target_chars=700, overlap_chars=200)
|
|
74
|
+
|
|
75
|
+
def chunk_block_w_offsets(block: str) -> list[dict]:
    """Chunk *block* (via chunk_block) and attach each chunk's character
    offsets within the original text.

    Returns:
        A list of {"text", "start", "end"} dicts. Chunks whose text no longer
        occurs verbatim in *block* (e.g. sentence chunks re-joined with
        spaces across original newlines) are skipped rather than given bogus
        negative offsets.
    """
    located: list[dict] = []
    search_from = 0
    for chunk in chunk_block(block):
        # Bug fix: chunks overlap, so a chunk may begin BEFORE the previous
        # chunk's end. The original advanced the search cursor to the previous
        # chunk's end, making every overlapped chunk unfindable (find() -> -1
        # and corrupt offsets). Search from just past the previous start.
        start = block.find(chunk, search_from)
        if start == -1:
            start = block.find(chunk)  # fallback: retry from the beginning
        if start == -1:
            continue  # not present verbatim; no trustworthy offsets exist
        end = start + len(chunk)
        located.append({"text": chunk, "start": start, "end": end})
        search_from = start + 1
    return located
|
|
89
|
+
|
|
90
|
+
def dedupe_entities(entities):
    """Drop exact duplicates (same text, span, and label), keeping first-seen order."""
    unique = []
    fingerprints = set()
    for entity in entities:
        fingerprint = (entity["text"], entity["start"], entity["end"], entity["label"])
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        unique.append(entity)
    return unique
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from pipeline.metadata_ctier import parse_corpus
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def write_batch(corpus_dir: Path, batch_name: str, entries: list[str]) -> Path:
    """Write *entries* into a ctier-style batch file under *corpus_dir*.

    Each stripped entry is prefixed with a "-----" delimiter line; entries
    are separated by a blank line. Returns the path of the written file.
    """
    target = corpus_dir / batch_name
    blocks = [f"-----\n{entry.strip()}" for entry in entries]
    target.write_text("\n\n".join(blocks), encoding="utf-8")
    return target
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_batch_read(tmp_path: Path):
    """End-to-end check of parse_corpus over a two-entry ctier batch file.

    Covers both supported entry formats (plain text_block and a
    python-literal nested_list) and the derived metadata: sequential ids,
    source, source_file, and 1-based entry_index.
    """
    corpus_dir = tmp_path / "corpus"
    corpus_dir.mkdir()
    write_batch(
        corpus_dir,
        "batch.1",
        [
            '3356: Proofpoint observed a spear-phishing campaign spreading Vega Stealer.',
            """['11345: "follow.user steals data and credentials"', [['follow.user', [1, 2], 'MW']]]""",
        ],
    )
    records = parse_corpus(corpus_dir=corpus_dir)
    assert len(records) == 2
    # First entry: plain text classified as "text_block".
    assert records[0]["id"] == "ctier_batch1_001"
    assert records[0]["source"] == "ctier"
    assert records[0]["source_file"] == "batch.1"
    assert records[0]["entry_index"] == 1
    assert records[0]["format"] == "text_block"
    # Second entry: annotation literal classified as "nested_list".
    assert records[1]["id"] == "ctier_batch1_002"
    assert records[1]["format"] == "nested_list"
    assert records[1]["source_file"] == "batch.1"
    assert records[1]["entry_index"] == 2
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from pipeline.metadata_ctier import detect_format, split_entries, build_record_id
|
|
3
|
+
|
|
4
|
+
sample = """
|
|
5
|
+
-----
|
|
6
|
+
entry one
|
|
7
|
+
|
|
8
|
+
-----
|
|
9
|
+
entry two
|
|
10
|
+
|
|
11
|
+
-----
|
|
12
|
+
entry three
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_detect_format_nested_list():
    """A python-literal annotation entry is classified as nested_list."""
    entry = """['3356: "Vega Stealer..."', [['Vega Stealer', [10, 12], 'MW']]]"""
    assert detect_format(entry) == "nested_list"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_detect_format_text_block():
    """A plain prose entry is classified as text_block."""
    entry = "3356: Proofpoint observed a spear-phishing campaign spreading Vega Stealer."
    assert detect_format(entry) == "text_block"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_detect_format_empty_raises():
    """Whitespace-only entries are rejected with ValueError."""
    with pytest.raises(ValueError):
        detect_format("   ")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_split_entries_basic():
    """Entries separated by ----- lines come back stripped, in order."""
    assert split_entries(sample) == ["entry one", "entry two", "entry three"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_split_entries_ignores_empty_blocks():
    """Extra delimiters around whitespace-only blocks add no entries."""
    padded = sample + "\n-----\n \n-----\n"
    assert split_entries(padded) == ["entry one", "entry two", "entry three"]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_build_record_id():
    """Record ids follow <source>_<stem-sans-dots>_<zero-padded-index>."""
    from pathlib import Path

    result = build_record_id("ctier", Path("data/corpus/batch.1"), 3)
    assert result == "ctier_batch1_003"
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from collections_workflow.src.extraction.gliner_utils import extract_candidates
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_gliner_extract_returns_list():
    """Smoke test: extract_candidates returns a list of entity dicts.

    Only the container shape is always asserted; per-entity keys are checked
    only when the model returns at least one hit, since the result may
    legitimately be empty.
    """
    text = "Vega Stealer can steal login credentials and credit card credentials from Chrome and Firefox."
    results = extract_candidates(text)
    assert isinstance(results, list)
    if results:
        assert "text" in results[0]
        assert "label" in results[0]
        assert "score" in results[0]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from src.extraction.spacy_utils import detect_lang, has_chinese, has_latin
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_detect_language_en():
    """Latin-only text is classified as "en"."""
    assert detect_lang("Vega Stealer steals login credentials.") == "en"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_detect_language_zh():
    """Chinese-only text is classified as "zh"."""
    assert detect_lang("窃取登录凭证和信用卡信息") == "zh"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_detect_language_mixed():
    """Text containing both scripts is classified as "mixed"."""
    assert detect_lang("Vega Stealer 窃取登录凭证") == "mixed"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_has_chinese():
    """has_chinese fires on CJK ideographs and not on plain ASCII."""
    assert has_chinese("测试")
    assert not has_chinese("test only")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_has_latin():
    """has_latin fires on ASCII letters and not on CJK-only text."""
    assert has_latin("test")
    assert not has_latin("测试")
|