iocflow 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iocflow/__init__.py +63 -0
- iocflow/allowlists.py +177 -0
- iocflow/cli.py +62 -0
- iocflow/extract.py +99 -0
- iocflow/extractors/__init__.py +35 -0
- iocflow/extractors/actors.py +107 -0
- iocflow/extractors/contacts.py +15 -0
- iocflow/extractors/files.py +84 -0
- iocflow/extractors/network.py +150 -0
- iocflow/extractors/vulns.py +37 -0
- iocflow/mitre.py +133 -0
- iocflow/models.py +157 -0
- iocflow/providers.py +91 -0
- iocflow/refang.py +27 -0
- iocflow-0.1.0.dist-info/METADATA +181 -0
- iocflow-0.1.0.dist-info/RECORD +19 -0
- iocflow-0.1.0.dist-info/WHEEL +4 -0
- iocflow-0.1.0.dist-info/entry_points.txt +2 -0
- iocflow-0.1.0.dist-info/licenses/LICENSE +21 -0
iocflow/__init__.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""iocflow — an IOC-lifecycle toolkit.
|
|
2
|
+
|
|
3
|
+
Layer 1 is threat-entity extraction: pull IPs, domains, URLs, filenames,
|
|
4
|
+
hashes, CVEs, emails, MITRE technique IDs, threat actors, and malware families
|
|
5
|
+
out of unstructured text. The extracted :class:`ExtractedEntities` is the input
|
|
6
|
+
type that later layers (enrichment, AI commentary, hunt generation, blocking)
|
|
7
|
+
build on.
|
|
8
|
+
|
|
9
|
+
Quick start::
|
|
10
|
+
|
|
11
|
+
from iocflow import extract
|
|
12
|
+
|
|
13
|
+
entities = extract("APT28 used 185.220.101.5 and evil[.]example[.]com")
|
|
14
|
+
print(entities.summary())
|
|
15
|
+
for indicator in entities.iter_indicators():
|
|
16
|
+
print(indicator.kind, indicator.value)
|
|
17
|
+
"""
|
|
18
|
+
from iocflow.extract import extract
|
|
19
|
+
from iocflow.extractors import (
|
|
20
|
+
extract_cves,
|
|
21
|
+
extract_domains,
|
|
22
|
+
extract_emails,
|
|
23
|
+
extract_filenames,
|
|
24
|
+
extract_hashes,
|
|
25
|
+
extract_ips,
|
|
26
|
+
extract_malware_families,
|
|
27
|
+
extract_mitre_procedures,
|
|
28
|
+
extract_mitre_techniques,
|
|
29
|
+
extract_threat_actors,
|
|
30
|
+
extract_urls,
|
|
31
|
+
)
|
|
32
|
+
from iocflow.models import ExtractedEntities, Indicator, ThreatActor
|
|
33
|
+
from iocflow.providers import ActorAliases, MalwareNames
|
|
34
|
+
from iocflow.refang import refang_text
|
|
35
|
+
|
|
36
|
+
__version__ = "0.1.0"
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
# Orchestrator
|
|
40
|
+
"extract",
|
|
41
|
+
# Result types
|
|
42
|
+
"ExtractedEntities",
|
|
43
|
+
"Indicator",
|
|
44
|
+
"ThreatActor",
|
|
45
|
+
# Pluggable sources
|
|
46
|
+
"ActorAliases",
|
|
47
|
+
"MalwareNames",
|
|
48
|
+
# Individual extractors
|
|
49
|
+
"extract_ips",
|
|
50
|
+
"extract_domains",
|
|
51
|
+
"extract_urls",
|
|
52
|
+
"extract_emails",
|
|
53
|
+
"extract_filenames",
|
|
54
|
+
"extract_hashes",
|
|
55
|
+
"extract_cves",
|
|
56
|
+
"extract_mitre_techniques",
|
|
57
|
+
"extract_mitre_procedures",
|
|
58
|
+
"extract_threat_actors",
|
|
59
|
+
"extract_malware_families",
|
|
60
|
+
# Utilities
|
|
61
|
+
"refang_text",
|
|
62
|
+
"__version__",
|
|
63
|
+
]
|
iocflow/allowlists.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Allowlists and blocklists used to suppress false positives.
|
|
2
|
+
|
|
3
|
+
These are deliberately exported as plain module-level sets so callers can
|
|
4
|
+
extend or override them, e.g.::
|
|
5
|
+
|
|
6
|
+
from iocflow import allowlists
|
|
7
|
+
allowlists.BENIGN_DOMAINS.add("corp.example.net")
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
# Benign domains to exclude during extraction. The IOC-hunt flow bypasses
|
|
12
|
+
# reputation filtering, so this list is intentionally broad: test/placeholder
|
|
13
|
+
# domains, package registries, security-vendor sites, threat-intel references,
|
|
14
|
+
# and major cloud/CDN infrastructure that routinely appears in reports.
|
|
15
|
+
BENIGN_DOMAINS = {
|
|
16
|
+
# Test / placeholder domains
|
|
17
|
+
"example.com", "example.org", "example.net", "localhost", "test.com",
|
|
18
|
+
"internal.local",
|
|
19
|
+
# Common email providers (email addresses still extracted, just not as domain IOCs)
|
|
20
|
+
"gmail.com", "hotmail.com", "yahoo.com", "outlook.com", "live.com",
|
|
21
|
+
# Package registries — legitimate infrastructure, not IOCs
|
|
22
|
+
"npmjs.org", "registry.npmjs.org", "yarn.npmjs.org",
|
|
23
|
+
"yarnpkg.com", "registry.yarnpkg.com",
|
|
24
|
+
"github.com", "raw.githubusercontent.com", "gist.github.com",
|
|
25
|
+
"pypi.org", "files.pythonhosted.org",
|
|
26
|
+
"rubygems.org", "nuget.org", "crates.io",
|
|
27
|
+
"packagist.org", "mvnrepository.com", "maven.org",
|
|
28
|
+
"docker.io", "docker.com", "hub.docker.com",
|
|
29
|
+
# Cybersecurity vendors — appear as references in reports, not IOCs
|
|
30
|
+
"paloaltonetworks.com", "unit42.paloaltonetworks.com",
|
|
31
|
+
"crowdstrike.com", "falcon.crowdstrike.com",
|
|
32
|
+
"mandiant.com", "cloud.google.com",
|
|
33
|
+
"microsoft.com", "learn.microsoft.com", "security.microsoft.com",
|
|
34
|
+
"cisco.com", "talosintelligence.com",
|
|
35
|
+
"fortinet.com", "fortiguard.com",
|
|
36
|
+
"sentinelone.com", "sentinellabs.com",
|
|
37
|
+
"trendmicro.com",
|
|
38
|
+
"sophos.com", "news.sophos.com",
|
|
39
|
+
"symantec.com", "broadcom.com",
|
|
40
|
+
"mcafee.com", "trellix.com",
|
|
41
|
+
"fireeye.com",
|
|
42
|
+
"elastic.co", "elastic.github.io",
|
|
43
|
+
"zscaler.com",
|
|
44
|
+
"proofpoint.com",
|
|
45
|
+
"checkpoint.com", "research.checkpoint.com",
|
|
46
|
+
"recordedfuture.com",
|
|
47
|
+
"sekoia.io",
|
|
48
|
+
"group-ib.com",
|
|
49
|
+
"kaspersky.com", "securelist.com",
|
|
50
|
+
"eset.com", "welivesecurity.com",
|
|
51
|
+
"bitdefender.com",
|
|
52
|
+
"malwarebytes.com",
|
|
53
|
+
"cybereason.com",
|
|
54
|
+
"rapid7.com",
|
|
55
|
+
"qualys.com",
|
|
56
|
+
"tenable.com",
|
|
57
|
+
"dragos.com",
|
|
58
|
+
"volexity.com",
|
|
59
|
+
"huntress.com",
|
|
60
|
+
"infoblox.com",
|
|
61
|
+
# Threat-intel / research references
|
|
62
|
+
"mitre.org", "attack.mitre.org", "cve.mitre.org",
|
|
63
|
+
"krebsonsecurity.com",
|
|
64
|
+
"bleepingcomputer.com",
|
|
65
|
+
"thehackernews.com",
|
|
66
|
+
"therecord.media",
|
|
67
|
+
"darkreading.com",
|
|
68
|
+
"securityweek.com",
|
|
69
|
+
"threatpost.com",
|
|
70
|
+
"cyberscoop.com",
|
|
71
|
+
"schneier.com",
|
|
72
|
+
"nist.gov", "nvd.nist.gov",
|
|
73
|
+
"cisa.gov", "us-cert.cisa.gov",
|
|
74
|
+
"cert.org",
|
|
75
|
+
"virustotal.com",
|
|
76
|
+
"shodan.io",
|
|
77
|
+
"abuse.ch", "bazaar.abuse.ch", "urlhaus.abuse.ch", "threatfox.abuse.ch",
|
|
78
|
+
"otx.alienvault.com", "alienvault.com",
|
|
79
|
+
"hybrid-analysis.com",
|
|
80
|
+
"any.run", "app.any.run",
|
|
81
|
+
"joesandbox.com", "joesecurity.org",
|
|
82
|
+
"urlscan.io",
|
|
83
|
+
"whois.domaintools.com", "domaintools.com",
|
|
84
|
+
"abuseipdb.com",
|
|
85
|
+
# Cloud / CDN infrastructure
|
|
86
|
+
"amazonaws.com", "azure.com", "azureedge.net",
|
|
87
|
+
"cloudflare.com", "cloudfront.net",
|
|
88
|
+
"akamai.com", "akamaitechnologies.com",
|
|
89
|
+
"googleapis.com",
|
|
90
|
+
"windows.net", "office365.com", "office.com",
|
|
91
|
+
"sharepoint.com", "onedrive.com",
|
|
92
|
+
"google.com", "gstatic.com",
|
|
93
|
+
"linkedin.com", "twitter.com", "x.com",
|
|
94
|
+
"wikipedia.org", "medium.com",
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
# Package-registry hosts: the host is benign infrastructure, but a *path* under
|
|
98
|
+
# it (registry.npmjs.org/<pkg>) can name a malicious package, so URL paths on
|
|
99
|
+
# these hosts are kept even though the bare host is in BENIGN_DOMAINS.
|
|
100
|
+
PACKAGE_REGISTRY_HOSTS = {
|
|
101
|
+
"npmjs.org", "registry.npmjs.org", "yarn.npmjs.org",
|
|
102
|
+
"yarnpkg.com", "registry.yarnpkg.com",
|
|
103
|
+
"pypi.org", "files.pythonhosted.org",
|
|
104
|
+
"rubygems.org", "nuget.org", "crates.io",
|
|
105
|
+
"packagist.org", "mvnrepository.com", "maven.org",
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
# Known benign IPs to exclude (loopback, unspecified, broadcast, public DNS resolvers).
|
|
109
|
+
BENIGN_IPS = {
|
|
110
|
+
"127.0.0.1", "0.0.0.0", "255.255.255.255",
|
|
111
|
+
"8.8.8.8", "8.8.4.4", # Google DNS
|
|
112
|
+
"1.1.1.1", "1.0.0.1", # Cloudflare DNS
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
# Extensions that are also valid TLDs. A bare "word.ext" with one of these is
|
|
116
|
+
# usually a filename (install.sh) rather than a domain (openclaw.ai), so it is
|
|
117
|
+
# only accepted as a domain when it doesn't look like a common filename.
|
|
118
|
+
FILE_EXTENSION_TLDS = {
|
|
119
|
+
"sh", "py", "pl", "rs", "ps", "cc", "md", "so", "la", "do", "to",
|
|
120
|
+
"ai", "st", "fm", "am", "dj", "gs", "ms", "lk", "im", "ws", "nu", "tk",
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
# Common filenames that collide with file-extension TLDs (install.sh, setup.py).
|
|
124
|
+
COMMON_FILENAME_STEMS = {
|
|
125
|
+
"install", "setup", "script", "run", "start", "init",
|
|
126
|
+
"main", "index", "test", "build", "deploy", "config",
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
# Common English words that happen to be MITRE malware/tool names.
|
|
130
|
+
MALWARE_BLOCKLIST = {
|
|
131
|
+
"anchor", "chaos", "empire", "expand", "flame", "havoc",
|
|
132
|
+
"james", "kevin", "milan", "mango", "meteor", "net", "ninja",
|
|
133
|
+
"ping", "rover", "royal", "ruler", "shark", "snake", "spark",
|
|
134
|
+
"solar", "page", "play",
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
# LOLBins — legitimate Windows/system utilities MITRE tracks as "tools".
|
|
138
|
+
SYSTEM_TOOL_BLOCKLIST = {
|
|
139
|
+
"arp", "at", "attrib", "bitsadmin", "certutil", "cipher.exe",
|
|
140
|
+
"cmd", "dsquery", "esentutl", "forfiles", "ftp", "ifconfig",
|
|
141
|
+
"ipconfig", "nbtstat", "nbtscan", "net", "netsh", "netstat",
|
|
142
|
+
"nltest", "ping", "psexec", "pwdump", "rclone", "reg", "route",
|
|
143
|
+
"schtasks", "sdelete", "systeminfo", "tasklist", "tor", "wevtutil",
|
|
144
|
+
"connectwise", "quick assist",
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
# Well-known threat-actor and ransomware names matched by exact word boundary.
|
|
148
|
+
WELL_KNOWN_ACTORS = [
|
|
149
|
+
# APT groups
|
|
150
|
+
"Lazarus", "Lazarus Group",
|
|
151
|
+
"Fancy Bear", "Cozy Bear",
|
|
152
|
+
"Sandworm", "Turla",
|
|
153
|
+
"Kimsuky", "Charming Kitten",
|
|
154
|
+
"OceanLotus", "Ocean Lotus",
|
|
155
|
+
"Equation Group",
|
|
156
|
+
"Scattered Spider",
|
|
157
|
+
"Nobelium", "Midnight Blizzard",
|
|
158
|
+
"Volt Typhoon", "Salt Typhoon",
|
|
159
|
+
# Ransomware families
|
|
160
|
+
"ALPHV", "BlackCat",
|
|
161
|
+
"LockBit", "Conti", "REvil",
|
|
162
|
+
"CrazyHunter", "Akira", "Play",
|
|
163
|
+
"Royal", "Black Basta", "BlackBasta",
|
|
164
|
+
"Cl0p", "Clop", "Cuba", "Hive",
|
|
165
|
+
"Medusa", "Rhysida", "BianLian",
|
|
166
|
+
"NoEscape", "Cactus", "Hunters International",
|
|
167
|
+
"Qilin", "INC Ransom", "RansomHub",
|
|
168
|
+
"DragonForce", "Fog", "Lynx",
|
|
169
|
+
]
|
|
170
|
+
|
|
171
|
+
# Capitalized words that precede "ransomware" but are not actor names.
|
|
172
|
+
RANSOMWARE_FALSE_POSITIVES = {
|
|
173
|
+
"The", "This", "That", "New", "Old", "Some", "Any", "Each", "Our", "Their",
|
|
174
|
+
"Executed", "Propagated", "Deployed", "Distributed", "Disguised", "Downloaded",
|
|
175
|
+
"Encrypted", "Delivered", "Launched", "Installed", "Targeted", "Modified",
|
|
176
|
+
"Prince", # Usually "fork of Prince ransomware" — context, not actor
|
|
177
|
+
}
|
iocflow/cli.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Command-line entry point: ``python -m iocflow`` (or the ``iocflow`` script)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from iocflow.extract import extract
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def main(argv=None) -> int:
    """Run the ``iocflow`` command line and return the process exit status.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` lets argparse
            fall back to ``sys.argv``.

    Returns:
        ``0`` after the extraction result has been written to stdout.
    """
    cli = argparse.ArgumentParser(
        prog="iocflow",
        description="Extract threat indicators (IOCs) from text.",
    )
    cli.add_argument(
        "text",
        nargs="*",
        help="Text to extract from. If omitted, reads from stdin.",
    )
    # All remaining options are plain on/off switches.
    switches = (
        ("--json", "Emit the full result as JSON instead of a human summary."),
        ("--no-refang", "Do not re-fang defanged IOCs before extracting."),
        ("--mitre", "Load MITRE malware names for family extraction (needs iocflow[mitre])."),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)
    options = cli.parse_args(argv)

    # Positional words are joined into one string; with none given, stdin is read.
    if options.text:
        source_text = " ".join(options.text)
    else:
        source_text = sys.stdin.read()

    family_names = None
    if options.mitre:
        # The MITRE provider is an optional extra, so it is imported lazily and
        # an ImportError is turned into a regular argparse usage error.
        try:
            from iocflow.mitre import mitre_malware_names

            family_names = mitre_malware_names()
        except ImportError:
            cli.error("--mitre requires the extra: pip install 'iocflow[mitre]'")

    result = extract(
        source_text,
        malware_names=family_names,
        refang=not options.no_refang,
    )

    if not options.json:
        print(result.summary())
        for item in result.iter_indicators():
            print(f" {item.kind:16} {item.value}")
        return 0

    json.dump(result.to_dict(), sys.stdout, indent=2)
    sys.stdout.write("\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
iocflow/extract.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""The :func:`extract` orchestrator — run every extractor over a piece of text."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from iocflow.extractors.actors import extract_malware_families, extract_threat_actors
|
|
9
|
+
from iocflow.extractors.contacts import extract_emails
|
|
10
|
+
from iocflow.extractors.files import extract_filenames, extract_hashes
|
|
11
|
+
from iocflow.extractors.network import extract_domains, extract_ips, extract_urls
|
|
12
|
+
from iocflow.extractors.vulns import extract_cves, extract_mitre_techniques
|
|
13
|
+
from iocflow.models import ExtractedEntities, ThreatActor
|
|
14
|
+
from iocflow.providers import ActorAliases, MalwareNames
|
|
15
|
+
from iocflow.refang import refang_text
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
_HTML_TAG = re.compile(r"<[^>]+>")
_WHITESPACE = re.compile(r"\s+")


def extract(
    text: str,
    *,
    actor_aliases: Optional[ActorAliases] = None,
    malware_names: Optional[MalwareNames] = None,
    refang: bool = True,
) -> ExtractedEntities:
    """Run every extractor over ``text`` and bundle the results.

    Args:
        text: The text to extract from. A falsy value yields an empty result
            without running any extractor.
        actor_aliases: Optional known-actor source. It is forwarded to the
            actor extractor and used to enrich each matched name with
            ``common_name`` / ``region`` / ``all_names``.
        malware_names: Optional malware/tool name source, forwarded to the
            malware-family extractor.
        refang: When true, the cleaned text is passed through
            :func:`refang_text` before extraction.

    Returns:
        An :class:`~iocflow.models.ExtractedEntities`.
    """
    if not text:
        return ExtractedEntities()

    # Replace HTML tags with spaces, then collapse every whitespace run.
    normalized = _WHITESPACE.sub(" ", _HTML_TAG.sub(" ", text))
    if refang:
        normalized = refang_text(normalized)

    actor_names = extract_threat_actors(normalized, actor_aliases)
    actor_records = _enrich_actors(actor_names, actor_aliases)

    # URLs are extracted once and shared with the filename extractor.
    found_urls = extract_urls(normalized)

    fields = {
        "ips": extract_ips(normalized),
        "domains": extract_domains(normalized),
        "urls": found_urls,
        "filenames": extract_filenames(normalized, urls=found_urls),
        "hashes": extract_hashes(normalized),
        "cves": extract_cves(normalized),
        "emails": extract_emails(normalized),
        "threat_actors": actor_names,
        "threat_actors_enriched": actor_records,
        "malware_families": extract_malware_families(normalized, malware_names),
        "mitre_techniques": extract_mitre_techniques(normalized),
    }
    result = ExtractedEntities(**fields)

    # Only log when something was actually found.
    if result.is_empty():
        return result
    logger.info("Extracted entities: %s", result.summary())
    return result
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _enrich_actors(
    raw_actors: list, actor_aliases: Optional[ActorAliases]
) -> list:
    """Turn raw actor names into :class:`ThreatActor` records.

    Names that ``actor_aliases.lookup`` resolves are de-duplicated by their
    lower-cased common name (first occurrence wins). Names without alias
    information are passed through as-is, using the raw name as common name.
    """
    records = []
    emitted_keys = set()
    for raw_name in raw_actors:
        details = actor_aliases.lookup(raw_name) if actor_aliases else None
        if not details:
            # No alias data: keep the name exactly as it was extracted.
            records.append(ThreatActor(name=raw_name, common_name=raw_name))
            continue

        canonical = details.get("common_name", raw_name)
        key = canonical.lower()
        if key in emitted_keys:
            continue
        emitted_keys.add(key)
        records.append(
            ThreatActor(
                name=raw_name,
                common_name=canonical,
                region=details.get("region", ""),
                all_names=details.get("all_names", []),
            )
        )
    return records
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Individual, composable extraction functions.
|
|
2
|
+
|
|
3
|
+
Each function takes text and returns a list/dict of one entity type. They are
|
|
4
|
+
re-exported from the top-level :mod:`iocflow` package for convenience.
|
|
5
|
+
"""
|
|
6
|
+
from iocflow.extractors.actors import (
|
|
7
|
+
extract_malware_families,
|
|
8
|
+
extract_threat_actors,
|
|
9
|
+
)
|
|
10
|
+
from iocflow.extractors.contacts import extract_emails
|
|
11
|
+
from iocflow.extractors.files import extract_filenames, extract_hashes
|
|
12
|
+
from iocflow.extractors.network import (
|
|
13
|
+
extract_domains,
|
|
14
|
+
extract_ips,
|
|
15
|
+
extract_urls,
|
|
16
|
+
)
|
|
17
|
+
from iocflow.extractors.vulns import (
|
|
18
|
+
extract_cves,
|
|
19
|
+
extract_mitre_procedures,
|
|
20
|
+
extract_mitre_techniques,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"extract_ips",
|
|
25
|
+
"extract_domains",
|
|
26
|
+
"extract_urls",
|
|
27
|
+
"extract_emails",
|
|
28
|
+
"extract_filenames",
|
|
29
|
+
"extract_hashes",
|
|
30
|
+
"extract_cves",
|
|
31
|
+
"extract_mitre_techniques",
|
|
32
|
+
"extract_mitre_procedures",
|
|
33
|
+
"extract_threat_actors",
|
|
34
|
+
"extract_malware_families",
|
|
35
|
+
]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Threat-actor and malware-family extraction.
|
|
2
|
+
|
|
3
|
+
Actor extraction works with zero external data via patterns and a curated
|
|
4
|
+
well-known list. Pass an :class:`~iocflow.providers.ActorAliases` to also match
|
|
5
|
+
a custom name set, and a :class:`~iocflow.providers.MalwareNames` to enable
|
|
6
|
+
malware-family extraction.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from typing import List, Optional
|
|
12
|
+
|
|
13
|
+
from iocflow.allowlists import (
|
|
14
|
+
MALWARE_BLOCKLIST,
|
|
15
|
+
RANSOMWARE_FALSE_POSITIVES,
|
|
16
|
+
SYSTEM_TOOL_BLOCKLIST,
|
|
17
|
+
WELL_KNOWN_ACTORS,
|
|
18
|
+
)
|
|
19
|
+
from iocflow.providers import ActorAliases, MalwareNames
|
|
20
|
+
|
|
21
|
+
# Structured actor designators: APT28, APT-28, UNC2452, FIN7, TA505, DEV-0537,
|
|
22
|
+
# STORM-0558.
|
|
23
|
+
_ACTOR_PATTERNS = [
    re.compile(r"\bAPT[-]?\d+\b", re.IGNORECASE),
    re.compile(r"\bUNC\d+\b", re.IGNORECASE),
    re.compile(r"\bFIN\d+\b", re.IGNORECASE),
    re.compile(r"\bTA\d+\b", re.IGNORECASE),
    re.compile(r"\bDEV-\d+\b", re.IGNORECASE),
    re.compile(r"\bSTORM-\d+\b", re.IGNORECASE),
]

# "<ProperNoun> ransomware" — case-sensitive so "go-based ransomware" misses.
_RANSOMWARE = re.compile(r"\b([A-Z][a-z]+(?:[A-Z][a-z0-9]*)*)\s+ransomware\b")

# Real APT names that are common English words and cause too many false hits.
_ACTOR_NAME_BLOCKLIST = {"lead"}


def extract_threat_actors(text: str, known: Optional[ActorAliases] = None) -> List[str]:
    """Extract threat-actor names.

    Strategies, in order:

    1. Whole-word, case-insensitive match of each caller-supplied known name
       (if ``known`` is given). Names of 3 characters or fewer and names in
       the internal blocklist are skipped.
    2. Match APT/UNC/FIN/TA/DEV/STORM designators (reported upper-cased).
    3. Match a curated list of well-known actor and ransomware names.
    4. Catch the ``"<Name> ransomware"`` pattern.

    Args:
        text: Text to scan.
        known: Optional alias source whose ``known_names`` are also matched.

    Returns:
        The distinct matched names in sorted order, so the result is
        reproducible from run to run.
    """
    actors = set()

    if known is not None:
        for name in known.known_names:
            if len(name) <= 3 or name.lower() in _ACTOR_NAME_BLOCKLIST:
                continue
            m = re.search(r"\b" + re.escape(name) + r"\b", text, re.IGNORECASE)
            if m:
                # Keep the spelling found in the text, not the lookup key.
                actors.add(m.group())

    for pattern in _ACTOR_PATTERNS:
        actors.update(hit.upper() for hit in pattern.findall(text))

    for actor in WELL_KNOWN_ACTORS:
        m = re.search(r"\b" + re.escape(actor) + r"\b", text, re.IGNORECASE)
        if m:
            actors.add(m.group())

    for m in _RANSOMWARE.finditer(text):
        name = m.group(1)
        if name not in RANSOMWARE_FALSE_POSITIVES and len(name) >= 4:
            actors.add(name)

    # A set has no stable iteration order for strings across interpreter runs
    # (hash randomization). Sorting keeps the output deterministic for callers
    # that depend on order, e.g. first-wins de-duplication.
    return sorted(actors)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def extract_malware_families(text: str, malware: Optional[MalwareNames] = None) -> List[str]:
    """Find malware-family names in ``text`` using a caller-supplied name set.

    Returns ``[]`` when ``malware`` is ``None`` or carries no names.
    False positives are limited in three ways:

    1. Names of 3 characters or fewer are ignored.
    2. Single-word names must match case-sensitively; names containing a
       space match case-insensitively.
    3. Names (and the canonical names they resolve to) found in the
       common-word or system-tool blocklists are ignored.

    A matched name is replaced by ``malware.alias_map``'s entry for its
    lower-cased form when one exists. The result is sorted.
    """
    if malware is None or not malware.names:
        return []

    def _blocked(lowered: str) -> bool:
        # True when the lower-cased name is in either blocklist.
        return lowered in MALWARE_BLOCKLIST or lowered in SYSTEM_TOOL_BLOCKLIST

    hits = set()
    for candidate in malware.names:
        if len(candidate) <= 3:
            continue
        lowered = candidate.lower()
        if _blocked(lowered):
            continue

        if " " in candidate:
            search_flags = re.IGNORECASE
        else:
            search_flags = 0
        whole_word = r"\b" + re.escape(candidate) + r"\b"
        if not re.search(whole_word, text, search_flags):
            continue

        # Normalize aliases, then re-check: the canonical form may be blocked
        # even when the alias itself was not.
        resolved = malware.alias_map.get(lowered, candidate)
        if _blocked(resolved.lower()):
            continue
        hits.add(resolved)

    return sorted(hits)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Contact indicators: email addresses."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
_EMAIL = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", re.IGNORECASE)

MAX_EMAILS = 20


def extract_emails(text: str) -> List[str]:
    """Return the lower-cased email addresses in ``text``.

    Duplicates are removed while keeping first-seen order, and at most
    ``MAX_EMAILS`` addresses are returned.
    """
    unique: List[str] = []
    already_seen = set()
    for hit in _EMAIL.finditer(text):
        address = hit.group(0).lower()
        if address in already_seen:
            continue
        already_seen.add(address)
        unique.append(address)
    return unique[:MAX_EMAILS]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""File indicators: suspicious filenames and cryptographic hashes."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
# Extensions that indicate a potentially malicious file. ``.com`` is excluded
|
|
8
|
+
# because it collides with domain names (github.com).
|
|
9
|
+
SUSPICIOUS_EXTENSIONS = {
    # Scripts
    "ps1", "sh", "bat", "cmd", "vbs", "vbe", "js", "jse", "wsf", "wsh",
    # Executables (no .com — avoids domain false positives)
    "exe", "dll", "msi", "scr", "pif",
    # Documents with macros
    "docm", "xlsm", "pptm", "dotm", "xltm",
    # Disk images (can carry malware)
    "iso", "img", "vhd", "vhdx",
    # Other
    "hta", "lnk", "jar", "msc",
}

_EXT_ALTERNATION = "|".join(re.escape(ext) for ext in SUSPICIOUS_EXTENSIONS)
_FILENAME = re.compile(rf"\b([a-zA-Z0-9_\-\.]+\.(?:{_EXT_ALTERNATION}))\b", re.IGNORECASE)

MAX_FILENAMES = 20


def _url_basename(url: str) -> str:
    """Return the last path segment of ``url`` without query string or fragment."""
    without_fragment = url.split("#", 1)[0]
    without_query = without_fragment.split("?", 1)[0]
    return without_query.rsplit("/", 1)[-1]


def extract_filenames(text: str, urls: Optional[List[str]] = None) -> List[str]:
    """Extract suspicious script/executable filenames from text and URLs.

    Args:
        text: Text scanned with the filename pattern built from
            ``SUSPICIOUS_EXTENSIONS``.
        urls: Optional URLs whose final path segment is also considered. The
            query string and fragment are removed first, so
            ``.../evil.exe?token=1`` yields ``evil.exe`` and
            ``.../view?file=evil.exe`` does not yield a bogus
            ``view?file=evil.exe``.

    Returns:
        Filenames in first-seen order (text matches before URL matches),
        de-duplicated case-insensitively and capped at ``MAX_FILENAMES``.
    """
    filenames: List[str] = []
    seen = set()

    for match in _FILENAME.finditer(text):
        filename = match.group(1)
        key = filename.lower()
        if key not in seen:
            filenames.append(filename)
            seen.add(key)

    for url in urls or ():
        basename = _url_basename(url)
        if not basename or "." not in basename:
            continue
        ext = basename.rsplit(".", 1)[-1].lower()
        key = basename.lower()
        if ext in SUSPICIOUS_EXTENSIONS and key not in seen:
            filenames.append(basename)
            seen.add(key)

    return filenames[:MAX_FILENAMES]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
_SHA256 = re.compile(r"\b[a-fA-F0-9]{64}\b")
_SHA1 = re.compile(r"\b[a-fA-F0-9]{40}\b")
_MD5 = re.compile(r"\b[a-fA-F0-9]{32}\b")


def extract_hashes(text: str) -> Dict[str, List[str]]:
    """Collect MD5, SHA1, and SHA256 hex digests found in ``text``.

    Digests are lower-cased and de-duplicated in first-seen order. Work
    proceeds from the longest digest type to the shortest: a SHA1 candidate
    equal to the first 40 characters of a reported SHA256 is dropped, and an
    MD5 candidate equal to the first 32 characters of a reported SHA1 or
    SHA256 is dropped.

    Returns:
        A dict with the keys ``"md5"``, ``"sha1"`` and ``"sha256"``, each
        mapping to a list of digests.
    """

    def _unique_lower(pattern, excluded=frozenset()):
        # Lower-case each match, skip excluded values, keep first-seen order.
        ordered = {}
        for digest in pattern.findall(text):
            lowered = digest.lower()
            if lowered not in excluded:
                ordered[lowered] = None
        return list(ordered)

    sha256_found = _unique_lower(_SHA256)

    sha1_found = _unique_lower(_SHA1, {digest[:40] for digest in sha256_found})

    md5_excluded = {digest[:32] for digest in sha1_found}
    md5_excluded |= {digest[:32] for digest in sha256_found}
    md5_found = _unique_lower(_MD5, md5_excluded)

    return {"md5": md5_found, "sha1": sha1_found, "sha256": sha256_found}
|