cryptic-cti 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cryptic_cti-0.1.0/PKG-INFO +7 -0
- cryptic_cti-0.1.0/README.md +0 -0
- cryptic_cti-0.1.0/pyproject.toml +13 -0
- cryptic_cti-0.1.0/setup.cfg +4 -0
- cryptic_cti-0.1.0/src/__init__.py +0 -0
- cryptic_cti-0.1.0/src/classification/__init__.py +0 -0
- cryptic_cti-0.1.0/src/classification/utils.py +8 -0
- cryptic_cti-0.1.0/src/cryptic_cti.egg-info/PKG-INFO +7 -0
- cryptic_cti-0.1.0/src/cryptic_cti.egg-info/SOURCES.txt +25 -0
- cryptic_cti-0.1.0/src/cryptic_cti.egg-info/dependency_links.txt +1 -0
- cryptic_cti-0.1.0/src/cryptic_cti.egg-info/top_level.txt +8 -0
- cryptic_cti-0.1.0/src/extraction/__init__.py +0 -0
- cryptic_cti-0.1.0/src/extraction/base.py +6 -0
- cryptic_cti-0.1.0/src/extraction/engine.py +12 -0
- cryptic_cti-0.1.0/src/extraction/gliner_utils.py +40 -0
- cryptic_cti-0.1.0/src/extraction/spacy_utils.py +34 -0
- cryptic_cti-0.1.0/src/file_utils.py +34 -0
- cryptic_cti-0.1.0/src/models/gliner_model.py +11 -0
- cryptic_cti-0.1.0/src/normalization/__init__.py +0 -0
- cryptic_cti-0.1.0/src/normalization/utils.py +21 -0
- cryptic_cti-0.1.0/src/output/output_objects.py +106 -0
- cryptic_cti-0.1.0/src/preprocessing/__init__.py +0 -0
- cryptic_cti-0.1.0/src/preprocessing/chunking.py +98 -0
- cryptic_cti-0.1.0/tests/test_ctier_parser.py +33 -0
- cryptic_cti-0.1.0/tests/test_parser_utils.py +46 -0
- cryptic_cti-0.1.0/tests/test_semantex_smoke.py +11 -0
- cryptic_cti-0.1.0/tests/test_spacy_utils.py +23 -0
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Build backend configuration (PEP 518).
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# Core package metadata (PEP 621).
# NOTE(review): no [project] dependencies are declared although the source
# imports spacy, setfit, and a GLiNER model wrapper — confirm intended.
# NOTE(review): the code uses @dataclass(slots=True) and `int | None` unions
# (Python 3.10+), so requires-python = ">=3.9" looks too low — confirm.
[project]
name = "cryptic-cti"
version = "0.1.0"
description = "Multilingual CTI collections pipeline for normalizing and structuring cybercrime leads"
readme = "README.md"
requires-python = ">=3.9"
authors = [
    { name="Cosmic Octopus" }
]
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from setfit import SetFitModel
|
|
4
|
+
|
|
5
|
+
def load_model(model_dir: Path) -> SetFitModel:
    """Load a fine-tuned SetFit model from a local directory.

    Args:
        model_dir: Directory containing the saved SetFit model files.

    Returns:
        The loaded SetFitModel.

    Raises:
        FileNotFoundError: If *model_dir* does not exist on disk.
    """
    if model_dir.exists():
        return SetFitModel.from_pretrained(str(model_dir), tokenizer_kwargs={"fix_mistral_regex": True})
    raise FileNotFoundError(f"SetFit model directory not found in: {model_dir}")
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/__init__.py
|
|
4
|
+
src/file_utils.py
|
|
5
|
+
src/classification/__init__.py
|
|
6
|
+
src/classification/utils.py
|
|
7
|
+
src/cryptic_cti.egg-info/PKG-INFO
|
|
8
|
+
src/cryptic_cti.egg-info/SOURCES.txt
|
|
9
|
+
src/cryptic_cti.egg-info/dependency_links.txt
|
|
10
|
+
src/cryptic_cti.egg-info/top_level.txt
|
|
11
|
+
src/extraction/__init__.py
|
|
12
|
+
src/extraction/base.py
|
|
13
|
+
src/extraction/engine.py
|
|
14
|
+
src/extraction/gliner_utils.py
|
|
15
|
+
src/extraction/spacy_utils.py
|
|
16
|
+
src/models/gliner_model.py
|
|
17
|
+
src/normalization/__init__.py
|
|
18
|
+
src/normalization/utils.py
|
|
19
|
+
src/output/output_objects.py
|
|
20
|
+
src/preprocessing/__init__.py
|
|
21
|
+
src/preprocessing/chunking.py
|
|
22
|
+
tests/test_ctier_parser.py
|
|
23
|
+
tests/test_parser_utils.py
|
|
24
|
+
tests/test_semantex_smoke.py
|
|
25
|
+
tests/test_spacy_utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from src.extraction.gliner_utils import GlinerRunner
|
|
2
|
+
from src.extraction.spacy_utils import SpacyRunner
|
|
3
|
+
|
|
4
|
+
class ExtractionEngine:
    """Fan a text out to every registered extraction runner."""

    def __init__(self):
        # Registry of runners keyed by a short name; extend here for future
        # runners (e.g. a RegexRunner).
        self.runners = {"spacy": SpacyRunner(), "gliner": GlinerRunner()}

    def run(self, text: str) -> dict:
        """Run every registered runner on *text*.

        Returns:
            A dict mapping each registry name to that runner's extraction
            result.
        """
        # Bug fix: the original iterated `self.runners` directly, which yields
        # only the string keys — tuple-unpacking a key raised ValueError on
        # first use. Iterate .items() and keep the registry key as the name.
        return {name: runner.extract(text) for name, runner in self.runners.items()}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from src.models.gliner_model import get_gliner_model
|
|
3
|
+
from src.extraction.base import ExtractionRunner
|
|
4
|
+
from src.preprocessing.chunking import chunk_block_w_offsets, dedupe_entities
|
|
5
|
+
|
|
6
|
+
# Hugging Face model id for the GLiNER extractor.
# NOTE(review): model_name appears unused in this module — presumably
# consumed by src.models.gliner_model.get_gliner_model(); confirm.
model_name = "urchade/gliner_medium-v2.1"

# Natural-language entity labels that GLiNER is prompted with.
labels = [
    "malware or tool name",
    "credential theft activity",
    "credential or data type",
    "platform or application",
    "actor or group name",
]
|
|
15
|
+
|
|
16
|
+
def extract_candidates(text: str) -> list[dict]:
    """Run GLiNER over *text* chunk-by-chunk and collect candidate entities.

    Chunk-local spans are shifted back to offsets in the original text, and
    exact duplicates are removed before returning.

    Returns:
        A list of {"text", "label", "score", "start", "end"} dicts.
    """
    model = get_gliner_model()
    chunks = chunk_block_w_offsets(text)
    print(f"[extract_candidates] {len(chunks)} chunks | text len={len(text)}")
    candidates: list[dict] = []
    for piece in chunks:
        base = piece["start"]
        for hit in model.predict_entities(piece["text"], labels):
            candidates.append({
                "text": hit["text"],
                "label": hit["label"],
                "score": float(hit["score"]),
                # Shift chunk-local offsets into whole-text coordinates.
                "start": base + hit["start"],
                "end": base + hit["end"],
            })
    return dedupe_entities(candidates)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class GlinerRunner(ExtractionRunner):
    """ExtractionRunner adapter around the module-level GLiNER pipeline."""

    def __init__(self):
        # NOTE(review): self.model and self.labels are stored but extract()
        # delegates to extract_candidates(), which calls get_gliner_model()
        # and uses the module-level labels itself — presumably the loader is
        # cached; confirm to avoid a double model load.
        self.model = get_gliner_model()
        self.labels = labels

    def extract(self, text: str) -> list[dict]:
        # Full chunk → predict → dedupe pipeline; see extract_candidates().
        return extract_candidates(text)
|
|
40
|
+
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import spacy
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
# NOTE(review): this import uses the `collections_workflow.` package root
# while sibling modules (engine.py, gliner_utils.py) import from
# `src.extraction.base` directly — confirm which root matches the installed
# layout; one of the two spellings will fail at import time.
from collections_workflow.src.extraction.base import ExtractionRunner

# Pipeline is loaded eagerly at import time; chunking.py lazy-loads via
# get_nlp() instead. NOTE(review): consider the same lazy pattern here to
# avoid the import-time cost.
nlp = spacy.load("en_core_web_sm")
|
|
8
|
+
|
|
9
|
+
def has_chinese(text: str) -> bool:
    """Return True if *text* contains at least one CJK Unified Ideograph."""
    return re.search(r"[\u4e00-\u9fff]", text) is not None
|
|
11
|
+
|
|
12
|
+
def has_latin(text: str) -> bool:
    """Return True if *text* contains at least one ASCII letter."""
    return re.search(r"[A-Za-z]", text) is not None
|
|
14
|
+
|
|
15
|
+
def detect_lang(text: str) -> str:
    """Classify *text* by script content.

    Returns:
        "zh" for Chinese-only, "en" for Latin-only, "mixed" when both
        scripts are present, and "unknown" when neither is (digits,
        punctuation, other scripts).
    """
    zh = has_chinese(text)
    en = has_latin(text)
    # Bug fix: the original's trailing `return "unknown"` sat after an
    # exhaustive if/elif/else and was unreachable, so script-free text was
    # misreported as "mixed". Make "unknown" the neither-script result.
    if zh and en:
        return "mixed"
    if zh:
        return "zh"
    if en:
        return "en"
    return "unknown"
|
|
25
|
+
|
|
26
|
+
def spacy_prepare(text: str) -> dict:
    """Run the spaCy pipeline over *text* and return basic structure stats.

    Returns:
        A dict with the detected language ("zh"/"en"/"mixed"/...), sentence
        and token counts, and the stripped sentence texts in order.
    """
    doc = nlp(text)
    sentences = list(doc.sents)
    # Bug fix: the original comprehension filtered on `if sentences` — the
    # whole list, a constant per iteration — which was a misleading no-op
    # (an empty list yields nothing from the comprehension anyway).
    return {
        "lang": detect_lang(text),
        "sentence_count": len(sentences),
        "token_count": len(doc),
        "sentences": [sent.text.strip() for sent in sentences],
    }
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SpacyRunner(ExtractionRunner):
    """ExtractionRunner adapter returning spaCy-derived text structure."""

    def extract(self, text: str) -> dict:
        # Delegates to the module-level pipeline helper.
        return spacy_prepare(text)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
def write_jsonl(path: Path, records: list[dict]) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines, creating parent dirs.

    Non-ASCII characters are written verbatim (ensure_ascii=False).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = (json.dumps(record, ensure_ascii=False) for record in records)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(line + "\n" for line in serialized)
|
|
10
|
+
|
|
11
|
+
def read_jsonl(path: Path) -> list[dict]:
    """Read a UTF-8 JSON Lines file into a list of dicts.

    Blank lines are skipped. Raises ValueError (with a 1-based line number)
    when a line is not valid JSON.
    """
    records: list[dict] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, start=1):
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                records.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSONL in {path} at line {line_no}: {e}") from e
    return records
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def latest_matching_file(directory: Path, pattern: str) -> Path:
    """Return the most recently modified file in *directory* matching *pattern*.

    Raises:
        FileNotFoundError: When no file matches the glob pattern.
    """
    candidates = [entry for entry in directory.glob(pattern) if entry.is_file()]
    if not candidates:
        raise FileNotFoundError(f"No files found matching {pattern} in {directory}")
    # Newest wins, decided by filesystem modification time.
    return max(candidates, key=lambda entry: entry.stat().st_mtime)
|
|
30
|
+
|
|
31
|
+
def load_json(path: Path) -> dict:
    """Parse a UTF-8 JSON file and return the decoded object."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
|
|
34
|
+
|
|
File without changes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
def normalize_key(value: str) -> str:
    """Lower-case *value* and collapse all runs of whitespace to one space."""
    # split() with no args already ignores leading/trailing whitespace.
    tokens = value.lower().split()
    return " ".join(tokens)
|
|
5
|
+
|
|
6
|
+
def normalize_value(value: str, mapping: dict[str, str]) -> tuple[str, bool]:
    """Canonicalize *value* through *mapping* (keyed by normalized form).

    Returns:
        (canonical_value, True) on a mapping hit, otherwise
        (stripped_value, False).
    """
    cleaned = value.strip()
    lookup = normalize_key(cleaned)
    try:
        return mapping[lookup], True
    except KeyError:
        return cleaned, False
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def dedupe_preserve_order(values: list) -> list:
    """Return *values* without duplicates, keeping first occurrences in order."""
    witnessed = set()
    result = []
    for item in values:
        if item in witnessed:
            continue
        witnessed.add(item)
        result.append(item)
    return result
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import asdict, dataclass, field
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from typing import Any
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
def norm_fieldname(field_name: str) -> str:
    """Normalize a field name for lookup: trimmed and lower-cased."""
    trimmed = field_name.strip()
    return trimmed.lower()
|
|
10
|
+
|
|
11
|
+
def utc_now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()
|
|
13
|
+
|
|
14
|
+
@dataclass(slots=True)
|
|
15
|
+
class Relationship:
|
|
16
|
+
type: str
|
|
17
|
+
target_id: str
|
|
18
|
+
description: str = ""
|
|
19
|
+
def to_dict(self) -> dict[str, Any]:
|
|
20
|
+
return asdict(self)
|
|
21
|
+
|
|
22
|
+
@dataclass(slots=True)
class Output:
    """Envelope for one pipeline output record.

    Carries provenance (producer, source_ids), handling metadata (tlp,
    confidence, tags), typed links to other records (relationships), and the
    producer-specific payload dict. Construction validates confidence and tlp
    and dedupes the list fields.
    """

    # Unique record id, generated per instance.
    id: str = field(default_factory=lambda: str(uuid4()))
    type: str = ""
    # ISO-8601 UTC creation timestamp.
    generated_at: str = field(default_factory=utc_now_iso)
    producer: str = ""
    source_ids: list[str] = field(default_factory=list)
    # Percentage 0-100, or None when unscored (validated in __post_init__).
    confidence: int | None = None
    tlp: str = "TLP:CLEAR"
    tags: list[str] = field(default_factory=list)
    relationships: list[Relationship] = field(default_factory=list)
    summary: str = ""
    # Unannotated, so dataclasses treats this as a shared class attribute,
    # not a field. NOTE(review): annotate as ClassVar[set[str]] (and move it
    # out of the field list) to make that explicit.
    allowed_tlp = {"TLP:CLEAR", "TLP:GREEN", "TLP:AMBER", "TLP:RED"}
    payload: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Validate confidence/tlp and dedupe list fields at construction time.
        if self.confidence is not None and not (0 <= self.confidence <= 100):
            raise ValueError("confidence must be between 0 and 100")
        self.source_ids = self.dedupe("source_ids")
        self.tags = self.dedupe("tags")
        if self.tlp not in self.allowed_tlp:
            raise ValueError(
                f"tlp must be one of {sorted(self.allowed_tlp)}, got {self.tlp!r}")

    def dedupe(self, field_name: str) -> list[Any]:
        """Return *field_name*'s list with duplicates removed, order preserved.

        Raises ValueError for unknown field names or non-list fields. Uses a
        list membership scan (O(n^2)) rather than a set so that unhashable
        items — e.g. Relationship, a non-frozen dataclass — still work.
        """
        field_name = norm_fieldname(field_name)
        if field_name not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {field_name}")
        values = getattr(self, field_name)
        if not isinstance(values, list):
            raise ValueError(f"Field {field_name} is not a list, cannot dedupe")
        out = []
        for value in values:
            if value not in out:
                out.append(value)
        return out

    def set_field(self, field_name: str, value: Any) -> None:
        """Validated setter for scalar fields.

        Rejects unknown field names, None values, the list fields (use
        add_to() for those), out-of-range confidence, and unknown TLP labels.
        """
        field_name = norm_fieldname(field_name)
        if field_name not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {field_name}")
        elif value is None:
            raise ValueError(f"Value for {field_name} cannot be empty")
        elif field_name in {"source_ids", "tags", "relationships"}:
            raise TypeError(f"use .add_to() to add values to {field_name}")
        elif field_name == "confidence" and not (0 <= value <= 100):
            raise ValueError("confidence must be between 0 and 100")
        elif field_name == "tlp" and value not in self.allowed_tlp:
            raise ValueError(f"tlp must be one of {sorted(self.allowed_tlp)}, got {value!r}")
        else:
            setattr(self, field_name, value)

    def add_to(self, field_name: str, value: Any) -> None:
        """Append a value (or list of values) to a list field, then dedupe.

        For "relationships" every added item must be a Relationship instance.
        Raises ValueError for unknown/non-list fields and TypeError for
        wrongly-typed relationship values.
        """
        field_name = norm_fieldname(field_name)
        if field_name not in self.__dataclass_fields__:
            raise ValueError(f"Invalid field name: {field_name}")
        current_value = getattr(self, field_name)
        if not isinstance(current_value, list):
            raise ValueError(f"Field {field_name} is not a list, use .set_field() instead")
        elif field_name == "relationships":
            # Type-check before mutating so a bad batch leaves state untouched.
            if isinstance(value, list):
                if not all(isinstance(item, Relationship) for item in value):
                    raise TypeError(f"All items in {field_name} must be of type Relationship")
            elif not isinstance(value, Relationship):
                raise TypeError(f"relationships must be of type Relationship, got {type(value)}")
        if isinstance(value, list):
            current_value.extend(value)
        else:
            current_value.append(value)
        # In-place slice assignment keeps external references to the list valid.
        current_value[:] = self.dedupe(field_name)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict; relationships are converted recursively.

        NOTE(review): list/dict fields are returned by reference, not copied —
        mutating the result mutates this instance.
        """
        return {
            "id": self.id,
            "type": self.type,
            "generated_at": self.generated_at,
            "producer": self.producer,
            "source_ids": self.source_ids,
            "confidence": self.confidence,
            "tlp": self.tlp,
            "tags": self.tags,
            "relationships": [rel.to_dict() for rel in self.relationships],
            "summary": self.summary,
            "payload": self.payload,
        }
|
|
File without changes
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Debug aid: the spacy import can be slow, so announce it at module load.
# NOTE(review): consider removing this or switching to logging for library use.
print(f"starting spacy import...")
import spacy


# Lazily-initialized shared spaCy pipeline; populated on first get_nlp() call.
_nlp = None
|
|
6
|
+
def get_nlp():
    """Return the module-wide spaCy pipeline, loading it on first use."""
    global _nlp
    if _nlp is not None:
        return _nlp
    _nlp = spacy.load("en_core_web_sm")
    return _nlp
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def sentence_chunks(text: str, target_chars: int = 900, overlap_sentences: int = 1) -> list[str]:
    """Split *text* into chunks of whole sentences of roughly *target_chars*.

    Consecutive chunks overlap by *overlap_sentences* sentences so context
    carries across chunk boundaries. When spaCy finds no sentences, the
    stripped text is returned as a single chunk (or [] if empty).
    """
    nlp = get_nlp()
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if not sentences:
        stripped = text.strip()
        return [stripped] if stripped else []
    chunks: list[str] = []
    current: list[str] = []
    i = 0
    while i < len(sentences):
        # Start a fresh chunk at sentence i.
        current = []
        current_len = 0
        start_i = i
        while i < len(sentences):
            sentence = sentences[i]
            # +1 accounts for the joining space inserted before this sentence.
            projected = current_len + len(sentence) + (1 if current else 0)
            # A chunk always takes at least one sentence, even if oversized.
            if current and projected > target_chars:
                break
            current.append(sentence)
            current_len = projected
            i += 1
        chunk = " ".join(current).strip()
        if chunk:
            chunks.append(chunk)
        if i >= len(sentences):
            break
        # Rewind for overlap, but always advance past the previous chunk's
        # first sentence so the outer loop is guaranteed to terminate.
        i = max(start_i + 1, i - overlap_sentences)
    return chunks
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def char_chunks(text: str, target_chars: int = 900, overlap_chars: int = 150) -> list[str]:
    """Split *text* into fixed-size character windows with trailing overlap.

    Each window is at most *target_chars* long; successive windows re-cover
    the last *overlap_chars* characters of the previous one.
    """
    text = text.strip()
    if not text:
        return []
    length = len(text)
    windows: list[str] = []
    cursor = 0
    while cursor < length:
        stop = min(cursor + target_chars, length)
        piece = text[cursor:stop].strip()
        if piece:
            windows.append(piece)
        if stop >= length:
            break
        # Step back for overlap, but always advance at least one character.
        cursor = max(stop - overlap_chars, cursor + 1)
    return windows
|
|
60
|
+
|
|
61
|
+
def chunk_block(block: str) -> list[str]:
    """Chunk one text block, preferring sentence-aware splitting.

    Short single-line blocks pass through untouched; blocks with enough
    periods are split by sentence, falling back to character windows on any
    failure or when the sentence heuristic does not apply.
    """
    block = block.strip()
    if not block:
        return []
    if len(block) <= 1200 and "\n" not in block:
        return [block]
    # Heuristic: more than two periods suggests real sentences to split on.
    if block.count(".") > 2:
        try:
            return sentence_chunks(block, target_chars=700, overlap_sentences=2)
        except Exception as e:
            # Best-effort: report and fall through to character chunking.
            print(f"Error during sentence chunking: {e}")
    return char_chunks(block, target_chars=700, overlap_chars=200)
|
|
74
|
+
|
|
75
|
+
def chunk_block_w_offsets(block: str) -> list[dict]:
    """Chunk *block* (via chunk_block) and attach each chunk's character
    offsets within the original text.

    Returns:
        A list of {"text", "start", "end"} dicts. Chunks whose text no longer
        occurs verbatim in *block* (e.g. sentence chunks re-joined with
        spaces across original newlines) are skipped rather than given bogus
        negative offsets.
    """
    located: list[dict] = []
    search_from = 0
    for chunk in chunk_block(block):
        # Bug fix: chunks overlap, so a chunk may begin BEFORE the previous
        # chunk's end. The original advanced the search cursor to the previous
        # chunk's end, making every overlapped chunk unfindable (find() -> -1
        # and corrupt offsets). Search from just past the previous start.
        start = block.find(chunk, search_from)
        if start == -1:
            start = block.find(chunk)  # fallback: retry from the beginning
        if start == -1:
            continue  # not present verbatim; no trustworthy offsets exist
        end = start + len(chunk)
        located.append({"text": chunk, "start": start, "end": end})
        search_from = start + 1
    return located
|
|
89
|
+
|
|
90
|
+
def dedupe_entities(entities):
    """Drop exact duplicates (same text, span, and label), keeping first-seen order."""
    unique = []
    fingerprints = set()
    for entity in entities:
        fingerprint = (entity["text"], entity["start"], entity["end"], entity["label"])
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        unique.append(entity)
    return unique
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from pipeline.metadata_ctier import parse_corpus
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def write_batch(corpus_dir: Path, batch_name: str, entries: list[str]) -> Path:
    """Write *entries* into a ctier-style batch file under *corpus_dir*.

    Each stripped entry is prefixed with a "-----" delimiter line; entries
    are separated by a blank line. Returns the path of the written file.
    """
    target = corpus_dir / batch_name
    blocks = [f"-----\n{entry.strip()}" for entry in entries]
    target.write_text("\n\n".join(blocks), encoding="utf-8")
    return target
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_batch_read(tmp_path: Path):
    """End-to-end check of parse_corpus over a two-entry ctier batch file.

    Covers both supported entry formats (plain text_block and a
    python-literal nested_list) and the derived metadata: sequential ids,
    source, source_file, and 1-based entry_index.
    """
    corpus_dir = tmp_path / "corpus"
    corpus_dir.mkdir()
    write_batch(
        corpus_dir,
        "batch.1",
        [
            '3356: Proofpoint observed a spear-phishing campaign spreading Vega Stealer.',
            """['11345: "follow.user steals data and credentials"', [['follow.user', [1, 2], 'MW']]]""",
        ],
    )
    records = parse_corpus(corpus_dir=corpus_dir)
    assert len(records) == 2
    # First entry: plain text classified as "text_block".
    assert records[0]["id"] == "ctier_batch1_001"
    assert records[0]["source"] == "ctier"
    assert records[0]["source_file"] == "batch.1"
    assert records[0]["entry_index"] == 1
    assert records[0]["format"] == "text_block"
    # Second entry: annotation literal classified as "nested_list".
    assert records[1]["id"] == "ctier_batch1_002"
    assert records[1]["format"] == "nested_list"
    assert records[1]["source_file"] == "batch.1"
    assert records[1]["entry_index"] == 2
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from pipeline.metadata_ctier import detect_format, split_entries, build_record_id
|
|
3
|
+
|
|
4
|
+
sample = """
|
|
5
|
+
-----
|
|
6
|
+
entry one
|
|
7
|
+
|
|
8
|
+
-----
|
|
9
|
+
entry two
|
|
10
|
+
|
|
11
|
+
-----
|
|
12
|
+
entry three
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_detect_format_nested_list():
    """A python-literal annotation entry is classified as nested_list."""
    entry = """['3356: "Vega Stealer..."', [['Vega Stealer', [10, 12], 'MW']]]"""
    assert detect_format(entry) == "nested_list"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_detect_format_text_block():
    """A plain prose entry is classified as text_block."""
    entry = "3356: Proofpoint observed a spear-phishing campaign spreading Vega Stealer."
    assert detect_format(entry) == "text_block"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_detect_format_empty_raises():
    """Whitespace-only entries are rejected with ValueError."""
    with pytest.raises(ValueError):
        detect_format("   ")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_split_entries_basic():
    """Entries separated by ----- lines come back stripped, in order."""
    assert split_entries(sample) == ["entry one", "entry two", "entry three"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_split_entries_ignores_empty_blocks():
    """Extra delimiters around whitespace-only blocks add no entries."""
    padded = sample + "\n-----\n \n-----\n"
    assert split_entries(padded) == ["entry one", "entry two", "entry three"]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_build_record_id():
    """Record ids follow <source>_<stem-sans-dots>_<zero-padded-index>."""
    from pathlib import Path

    result = build_record_id("ctier", Path("data/corpus/batch.1"), 3)
    assert result == "ctier_batch1_003"
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from collections_workflow.src.extraction.gliner_utils import extract_candidates
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_gliner_extract_returns_list():
    """Smoke test: extract_candidates returns a list of entity dicts.

    Only the container shape is always asserted; per-entity keys are checked
    only when the model returns at least one hit, since the result may
    legitimately be empty.
    """
    text = "Vega Stealer can steal login credentials and credit card credentials from Chrome and Firefox."
    results = extract_candidates(text)
    assert isinstance(results, list)
    if results:
        assert "text" in results[0]
        assert "label" in results[0]
        assert "score" in results[0]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from src.extraction.spacy_utils import detect_lang, has_chinese, has_latin
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_detect_language_en():
    """Latin-only text is classified as "en"."""
    assert detect_lang("Vega Stealer steals login credentials.") == "en"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_detect_language_zh():
    """Chinese-only text is classified as "zh"."""
    assert detect_lang("窃取登录凭证和信用卡信息") == "zh"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_detect_language_mixed():
    """Text containing both scripts is classified as "mixed"."""
    assert detect_lang("Vega Stealer 窃取登录凭证") == "mixed"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_has_chinese():
    """has_chinese fires on CJK ideographs and not on plain ASCII."""
    assert has_chinese("测试")
    assert not has_chinese("test only")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_has_latin():
    """has_latin fires on ASCII letters and not on CJK-only text."""
    assert has_latin("test")
    assert not has_latin("测试")
|