iocflow 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
iocflow/__init__.py ADDED
@@ -0,0 +1,63 @@
1
+ """iocflow — an IOC-lifecycle toolkit.
2
+
3
+ Layer 1 is threat-entity extraction: pull IPs, domains, URLs, filenames,
4
+ hashes, CVEs, emails, MITRE technique IDs, threat actors, and malware families
5
+ out of unstructured text. The extracted :class:`ExtractedEntities` is the input
6
+ type that later layers (enrichment, AI commentary, hunt generation, blocking)
7
+ build on.
8
+
9
+ Quick start::
10
+
11
+ from iocflow import extract
12
+
13
+ entities = extract("APT28 used 185.220.101.5 and evil[.]example[.]com")
14
+ print(entities.summary())
15
+ for indicator in entities.iter_indicators():
16
+ print(indicator.kind, indicator.value)
17
+ """
18
+ from iocflow.extract import extract
19
+ from iocflow.extractors import (
20
+ extract_cves,
21
+ extract_domains,
22
+ extract_emails,
23
+ extract_filenames,
24
+ extract_hashes,
25
+ extract_ips,
26
+ extract_malware_families,
27
+ extract_mitre_procedures,
28
+ extract_mitre_techniques,
29
+ extract_threat_actors,
30
+ extract_urls,
31
+ )
32
+ from iocflow.models import ExtractedEntities, Indicator, ThreatActor
33
+ from iocflow.providers import ActorAliases, MalwareNames
34
+ from iocflow.refang import refang_text
35
+
36
+ __version__ = "0.1.0"
37
+
38
+ __all__ = [
39
+ # Orchestrator
40
+ "extract",
41
+ # Result types
42
+ "ExtractedEntities",
43
+ "Indicator",
44
+ "ThreatActor",
45
+ # Pluggable sources
46
+ "ActorAliases",
47
+ "MalwareNames",
48
+ # Individual extractors
49
+ "extract_ips",
50
+ "extract_domains",
51
+ "extract_urls",
52
+ "extract_emails",
53
+ "extract_filenames",
54
+ "extract_hashes",
55
+ "extract_cves",
56
+ "extract_mitre_techniques",
57
+ "extract_mitre_procedures",
58
+ "extract_threat_actors",
59
+ "extract_malware_families",
60
+ # Utilities
61
+ "refang_text",
62
+ "__version__",
63
+ ]
iocflow/allowlists.py ADDED
@@ -0,0 +1,177 @@
1
+ """Allowlists and blocklists used to suppress false positives.
2
+
3
+ These are deliberately exported as plain module-level sets so callers can
4
+ extend or override them, e.g.::
5
+
6
+ from iocflow import allowlists
7
+ allowlists.BENIGN_DOMAINS.add("corp.example.net")
8
+ """
9
+ from __future__ import annotations
10
+
11
+ # Benign domains to exclude during extraction. The IOC-hunt flow bypasses
12
+ # reputation filtering, so this list is intentionally broad: test/placeholder
13
+ # domains, package registries, security-vendor sites, threat-intel references,
14
+ # and major cloud/CDN infrastructure that routinely appears in reports.
15
+ BENIGN_DOMAINS = {
16
+ # Test / placeholder domains
17
+ "example.com", "example.org", "example.net", "localhost", "test.com",
18
+ "internal.local",
19
+ # Common email providers (email addresses still extracted, just not as domain IOCs)
20
+ "gmail.com", "hotmail.com", "yahoo.com", "outlook.com", "live.com",
21
+ # Package registries — legitimate infrastructure, not IOCs
22
+ "npmjs.org", "registry.npmjs.org", "yarn.npmjs.org",
23
+ "yarnpkg.com", "registry.yarnpkg.com",
24
+ "github.com", "raw.githubusercontent.com", "gist.github.com",
25
+ "pypi.org", "files.pythonhosted.org",
26
+ "rubygems.org", "nuget.org", "crates.io",
27
+ "packagist.org", "mvnrepository.com", "maven.org",
28
+ "docker.io", "docker.com", "hub.docker.com",
29
+ # Cybersecurity vendors — appear as references in reports, not IOCs
30
+ "paloaltonetworks.com", "unit42.paloaltonetworks.com",
31
+ "crowdstrike.com", "falcon.crowdstrike.com",
32
+ "mandiant.com", "cloud.google.com",
33
+ "microsoft.com", "learn.microsoft.com", "security.microsoft.com",
34
+ "cisco.com", "talosintelligence.com",
35
+ "fortinet.com", "fortiguard.com",
36
+ "sentinelone.com", "sentinellabs.com",
37
+ "trendmicro.com",
38
+ "sophos.com", "news.sophos.com",
39
+ "symantec.com", "broadcom.com",
40
+ "mcafee.com", "trellix.com",
41
+ "fireeye.com",
42
+ "elastic.co", "elastic.github.io",
43
+ "zscaler.com",
44
+ "proofpoint.com",
45
+ "checkpoint.com", "research.checkpoint.com",
46
+ "recordedfuture.com",
47
+ "sekoia.io",
48
+ "group-ib.com",
49
+ "kaspersky.com", "securelist.com",
50
+ "eset.com", "welivesecurity.com",
51
+ "bitdefender.com",
52
+ "malwarebytes.com",
53
+ "cybereason.com",
54
+ "rapid7.com",
55
+ "qualys.com",
56
+ "tenable.com",
57
+ "dragos.com",
58
+ "volexity.com",
59
+ "huntress.com",
60
+ "infoblox.com",
61
+ # Threat-intel / research references
62
+ "mitre.org", "attack.mitre.org", "cve.mitre.org",
63
+ "krebsonsecurity.com",
64
+ "bleepingcomputer.com",
65
+ "thehackernews.com",
66
+ "therecord.media",
67
+ "darkreading.com",
68
+ "securityweek.com",
69
+ "threatpost.com",
70
+ "cyberscoop.com",
71
+ "schneier.com",
72
+ "nist.gov", "nvd.nist.gov",
73
+ "cisa.gov", "us-cert.cisa.gov",
74
+ "cert.org",
75
+ "virustotal.com",
76
+ "shodan.io",
77
+ "abuse.ch", "bazaar.abuse.ch", "urlhaus.abuse.ch", "threatfox.abuse.ch",
78
+ "otx.alienvault.com", "alienvault.com",
79
+ "hybrid-analysis.com",
80
+ "any.run", "app.any.run",
81
+ "joesandbox.com", "joesecurity.org",
82
+ "urlscan.io",
83
+ "whois.domaintools.com", "domaintools.com",
84
+ "abuseipdb.com",
85
+ # Cloud / CDN infrastructure
86
+ "amazonaws.com", "azure.com", "azureedge.net",
87
+ "cloudflare.com", "cloudfront.net",
88
+ "akamai.com", "akamaitechnologies.com",
89
+ "googleapis.com",
90
+ "windows.net", "office365.com", "office.com",
91
+ "sharepoint.com", "onedrive.com",
92
+ "google.com", "gstatic.com",
93
+ "linkedin.com", "twitter.com", "x.com",
94
+ "wikipedia.org", "medium.com",
95
+ }
96
+
97
+ # Package-registry hosts: the host is benign infrastructure, but a *path* under
98
+ # it (registry.npmjs.org/<pkg>) can name a malicious package, so URL paths on
99
+ # these hosts are kept even though the bare host is in BENIGN_DOMAINS.
100
+ PACKAGE_REGISTRY_HOSTS = {
101
+ "npmjs.org", "registry.npmjs.org", "yarn.npmjs.org",
102
+ "yarnpkg.com", "registry.yarnpkg.com",
103
+ "pypi.org", "files.pythonhosted.org",
104
+ "rubygems.org", "nuget.org", "crates.io",
105
+ "packagist.org", "mvnrepository.com", "maven.org",
106
+ }
107
+
108
+ # Known benign IPs to exclude (loopback, broadcast, public resolvers).
109
+ BENIGN_IPS = {
110
+ "127.0.0.1", "0.0.0.0", "255.255.255.255",
111
+ "8.8.8.8", "8.8.4.4", # Google DNS
112
+ "1.1.1.1", "1.0.0.1", # Cloudflare DNS
113
+ }
114
+
115
+ # Extensions that are also valid TLDs. A bare "word.ext" with one of these is
116
+ # usually a filename (install.sh) rather than a domain (openclaw.ai), so it is
117
+ # only accepted as a domain when it doesn't look like a common filename.
118
+ FILE_EXTENSION_TLDS = {
119
+ "sh", "py", "pl", "rs", "ps", "cc", "md", "so", "la", "do", "to",
120
+ "ai", "st", "fm", "am", "dj", "gs", "ms", "lk", "im", "ws", "nu", "tk",
121
+ }
122
+
123
+ # Common filenames that collide with file-extension TLDs (install.sh, setup.py).
124
+ COMMON_FILENAME_STEMS = {
125
+ "install", "setup", "script", "run", "start", "init",
126
+ "main", "index", "test", "build", "deploy", "config",
127
+ }
128
+
129
+ # Common English words that happen to be MITRE malware/tool names.
130
+ MALWARE_BLOCKLIST = {
131
+ "anchor", "chaos", "empire", "expand", "flame", "havoc",
132
+ "james", "kevin", "milan", "mango", "meteor", "net", "ninja",
133
+ "ping", "rover", "royal", "ruler", "shark", "snake", "spark",
134
+ "solar", "page", "play",
135
+ }
136
+
137
+ # LOLBins — legitimate Windows/system utilities MITRE tracks as "tools".
138
+ SYSTEM_TOOL_BLOCKLIST = {
139
+ "arp", "at", "attrib", "bitsadmin", "certutil", "cipher.exe",
140
+ "cmd", "dsquery", "esentutl", "forfiles", "ftp", "ifconfig",
141
+ "ipconfig", "nbtstat", "nbtscan", "net", "netsh", "netstat",
142
+ "nltest", "ping", "psexec", "pwdump", "rclone", "reg", "route",
143
+ "schtasks", "sdelete", "systeminfo", "tasklist", "tor", "wevtutil",
144
+ "connectwise", "quick assist",
145
+ }
146
+
147
+ # Well-known threat-actor and ransomware names matched by exact word boundary.
148
+ WELL_KNOWN_ACTORS = [
149
+ # APT groups
150
+ "Lazarus", "Lazarus Group",
151
+ "Fancy Bear", "Cozy Bear",
152
+ "Sandworm", "Turla",
153
+ "Kimsuky", "Charming Kitten",
154
+ "OceanLotus", "Ocean Lotus",
155
+ "Equation Group",
156
+ "Scattered Spider",
157
+ "Nobelium", "Midnight Blizzard",
158
+ "Volt Typhoon", "Salt Typhoon",
159
+ # Ransomware families
160
+ "ALPHV", "BlackCat",
161
+ "LockBit", "Conti", "REvil",
162
+ "CrazyHunter", "Akira", "Play",
163
+ "Royal", "Black Basta", "BlackBasta",
164
+ "Cl0p", "Clop", "Cuba", "Hive",
165
+ "Medusa", "Rhysida", "BianLian",
166
+ "NoEscape", "Cactus", "Hunters International",
167
+ "Qilin", "INC Ransom", "RansomHub",
168
+ "DragonForce", "Fog", "Lynx",
169
+ ]
170
+
171
+ # Capitalized words that precede "ransomware" but are not actor names.
172
+ RANSOMWARE_FALSE_POSITIVES = {
173
+ "The", "This", "That", "New", "Old", "Some", "Any", "Each", "Our", "Their",
174
+ "Executed", "Propagated", "Deployed", "Distributed", "Disguised", "Downloaded",
175
+ "Encrypted", "Delivered", "Launched", "Installed", "Targeted", "Modified",
176
+ "Prince", # Usually "fork of Prince ransomware" — context, not actor
177
+ }
iocflow/cli.py ADDED
@@ -0,0 +1,62 @@
1
+ """Command-line entry point: ``python -m iocflow`` (or the ``iocflow`` script)."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+
8
+ from iocflow.extract import extract
9
+
10
+
11
def main(argv=None) -> int:
    """Run the ``iocflow`` command line and return its exit status.

    Args:
        argv: Argument list handed to :mod:`argparse`. ``None`` lets argparse
            fall back to its own default.

    Returns:
        ``0`` once the extraction result has been written to stdout.
    """
    cli = argparse.ArgumentParser(
        prog="iocflow",
        description="Extract threat indicators (IOCs) from text.",
    )
    cli.add_argument(
        "text",
        nargs="*",
        help="Text to extract from. If omitted, reads from stdin.",
    )
    # The three switches share the same shape, so register them from a table.
    switches = (
        ("--json", "Emit the full result as JSON instead of a human summary."),
        ("--no-refang", "Do not re-fang defanged IOCs before extracting."),
        (
            "--mitre",
            "Load MITRE malware names for family extraction (needs iocflow[mitre]).",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)
    options = cli.parse_args(argv)

    # Positional words are joined back into one string; with none given the
    # whole of stdin is the input.
    if options.text:
        source_text = " ".join(options.text)
    else:
        source_text = sys.stdin.read()

    families = None
    if options.mitre:
        try:
            # Imported lazily: the module is only needed when --mitre is set.
            from iocflow.mitre import mitre_malware_names

            families = mitre_malware_names()
        except ImportError:
            cli.error("--mitre requires the extra: pip install 'iocflow[mitre]'")

    result = extract(
        source_text,
        malware_names=families,
        refang=not options.no_refang,
    )

    if not options.json:
        print(result.summary())
        for item in result.iter_indicators():
            print(f" {item.kind:16} {item.value}")
        return 0

    json.dump(result.to_dict(), sys.stdout, indent=2)
    sys.stdout.write("\n")
    return 0
59
+
60
+
61
+ if __name__ == "__main__":
62
+ raise SystemExit(main())
iocflow/extract.py ADDED
@@ -0,0 +1,99 @@
1
+ """The :func:`extract` orchestrator — run every extractor over a piece of text."""
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ import re
6
+ from typing import Optional
7
+
8
+ from iocflow.extractors.actors import extract_malware_families, extract_threat_actors
9
+ from iocflow.extractors.contacts import extract_emails
10
+ from iocflow.extractors.files import extract_filenames, extract_hashes
11
+ from iocflow.extractors.network import extract_domains, extract_ips, extract_urls
12
+ from iocflow.extractors.vulns import extract_cves, extract_mitre_techniques
13
+ from iocflow.models import ExtractedEntities, ThreatActor
14
+ from iocflow.providers import ActorAliases, MalwareNames
15
+ from iocflow.refang import refang_text
16
+
17
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches "<", one or more characters that are not ">", then ">" — i.e. anything
# shaped like a markup tag.
_HTML_TAG = re.compile(r"<[^>]+>")
# Matches any run of one or more whitespace characters.
_WHITESPACE = re.compile(r"\s+")
21
+
22
+
23
def extract(
    text: str,
    *,
    actor_aliases: Optional[ActorAliases] = None,
    malware_names: Optional[MalwareNames] = None,
    refang: bool = True,
) -> ExtractedEntities:
    """Run every extractor over ``text`` and bundle the results.

    Args:
        text: The text to extract from. A falsy value short-circuits to an
            empty :class:`~iocflow.models.ExtractedEntities`.
        actor_aliases: Optional known-actor provider. It is forwarded to the
            threat-actor extractor and used to build the enriched actor list.
        malware_names: Optional malware/tool name provider, forwarded to the
            malware-family extractor.
        refang: When true, the cleaned text is passed through
            :func:`~iocflow.refang.refang_text` before extraction.

    Returns:
        An :class:`~iocflow.models.ExtractedEntities` holding one field per
        entity type.
    """
    if not text:
        return ExtractedEntities()

    # Replace tags with a space, squeeze whitespace runs to a single space,
    # then optionally refang. Every extractor sees this same prepared string.
    prepared = _WHITESPACE.sub(" ", _HTML_TAG.sub(" ", text))
    if refang:
        prepared = refang_text(prepared)

    actor_names = extract_threat_actors(prepared, actor_aliases)
    actor_records = _enrich_actors(actor_names, actor_aliases)

    # URLs are computed once because the filename extractor also consumes them.
    found_urls = extract_urls(prepared)

    fields = {
        "ips": extract_ips(prepared),
        "domains": extract_domains(prepared),
        "urls": found_urls,
        "filenames": extract_filenames(prepared, urls=found_urls),
        "hashes": extract_hashes(prepared),
        "cves": extract_cves(prepared),
        "emails": extract_emails(prepared),
        "threat_actors": actor_names,
        "threat_actors_enriched": actor_records,
        "malware_families": extract_malware_families(prepared, malware_names),
        "mitre_techniques": extract_mitre_techniques(prepared),
    }
    result = ExtractedEntities(**fields)

    # Only log when something was actually found.
    if result.is_empty():
        return result
    logger.info("Extracted entities: %s", result.summary())
    return result
74
+
75
+
76
def _enrich_actors(
    raw_actors: list, actor_aliases: Optional[ActorAliases]
) -> list:
    """Turn raw actor names into :class:`ThreatActor` records.

    Names that ``actor_aliases.lookup`` resolves are de-duplicated by their
    lower-cased common name (the first name seen wins). Names with no lookup
    result, or all names when no provider is given, are passed through with
    ``common_name`` equal to the raw name and are not de-duplicated.
    """
    records = []
    emitted_keys = set()
    for actor in raw_actors:
        details = actor_aliases.lookup(actor) if actor_aliases else None

        if not details:
            # Unknown to the provider: keep the name as its own common name.
            records.append(ThreatActor(name=actor, common_name=actor))
            continue

        canonical = details.get("common_name", actor)
        key = canonical.lower()
        if key in emitted_keys:
            # Another alias of the same actor was already emitted.
            continue
        emitted_keys.add(key)

        records.append(
            ThreatActor(
                name=actor,
                common_name=canonical,
                region=details.get("region", ""),
                all_names=details.get("all_names", []),
            )
        )
    return records
@@ -0,0 +1,35 @@
1
+ """Individual, composable extraction functions.
2
+
3
+ Each function takes text and returns a list/dict of one entity type. They are
4
+ re-exported from the top-level :mod:`iocflow` package for convenience.
5
+ """
6
+ from iocflow.extractors.actors import (
7
+ extract_malware_families,
8
+ extract_threat_actors,
9
+ )
10
+ from iocflow.extractors.contacts import extract_emails
11
+ from iocflow.extractors.files import extract_filenames, extract_hashes
12
+ from iocflow.extractors.network import (
13
+ extract_domains,
14
+ extract_ips,
15
+ extract_urls,
16
+ )
17
+ from iocflow.extractors.vulns import (
18
+ extract_cves,
19
+ extract_mitre_procedures,
20
+ extract_mitre_techniques,
21
+ )
22
+
23
+ __all__ = [
24
+ "extract_ips",
25
+ "extract_domains",
26
+ "extract_urls",
27
+ "extract_emails",
28
+ "extract_filenames",
29
+ "extract_hashes",
30
+ "extract_cves",
31
+ "extract_mitre_techniques",
32
+ "extract_mitre_procedures",
33
+ "extract_threat_actors",
34
+ "extract_malware_families",
35
+ ]
@@ -0,0 +1,107 @@
1
+ """Threat-actor and malware-family extraction.
2
+
3
+ Actor extraction works with zero external data via patterns and a curated
4
+ well-known list. Pass an :class:`~iocflow.providers.ActorAliases` to also match
5
+ a custom name set, and a :class:`~iocflow.providers.MalwareNames` to enable
6
+ malware-family extraction.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from typing import List, Optional
12
+
13
+ from iocflow.allowlists import (
14
+ MALWARE_BLOCKLIST,
15
+ RANSOMWARE_FALSE_POSITIVES,
16
+ SYSTEM_TOOL_BLOCKLIST,
17
+ WELL_KNOWN_ACTORS,
18
+ )
19
+ from iocflow.providers import ActorAliases, MalwareNames
20
+
21
+ # Structured actor designators: APT28, APT-28, UNC2452, FIN7, TA505, DEV-0537,
22
+ # STORM-0558.
23
# One compiled, case-insensitive pattern per designator family, in the order
# APT, UNC, FIN, TA, DEV, STORM.
_ACTOR_PATTERNS = [
    re.compile(expression, re.IGNORECASE)
    for expression in (
        r"\bAPT[-]?\d+\b",
        r"\bUNC\d+\b",
        r"\bFIN\d+\b",
        r"\bTA\d+\b",
        r"\bDEV-\d+\b",
        r"\bSTORM-\d+\b",
    )
]

# A capitalized word (optionally CamelCase, e.g. "LockBit") followed by the
# literal word "ransomware". Deliberately case-sensitive so lower-case
# phrases such as "go-based ransomware" do not match.
_RANSOMWARE = re.compile(
    r"\b([A-Z][a-z]+(?:[A-Z][a-z0-9]*)*)\s+ransomware\b"
)

# Genuine APT names that are also everyday English words and would produce
# too many false hits; compared against the lower-cased candidate name.
_ACTOR_NAME_BLOCKLIST = {"lead"}
37
+
38
+
39
def extract_threat_actors(text: str, known: Optional[ActorAliases] = None) -> List[str]:
    """Extract threat-actor names from ``text``.

    Strategies, applied in order (results are merged into one set):

    1. Match each name in ``known.known_names`` on word boundaries,
       case-insensitively (only when ``known`` is given). Names of three
       characters or fewer, and names in ``_ACTOR_NAME_BLOCKLIST``, are
       skipped. The text's own spelling of the first occurrence is kept.
    2. Match APT/UNC/FIN/TA/DEV/STORM designators; hits are upper-cased.
    3. Match the curated ``WELL_KNOWN_ACTORS`` list the same way as step 1.
    4. Catch the ``"<Name> ransomware"`` pattern, dropping names shorter than
       four characters and those in ``RANSOMWARE_FALSE_POSITIVES``.

    Args:
        text: Text to scan.
        known: Optional provider exposing a ``known_names`` iterable.

    Returns:
        The distinct names found, sorted. Sorting makes the result
        reproducible: iterating the underlying set directly would yield an
        order that changes with string-hash randomization between runs.
    """
    actors = set()

    if known is not None:
        for name in known.known_names:
            if len(name) <= 3 or name.lower() in _ACTOR_NAME_BLOCKLIST:
                continue
            mention = re.search(r"\b" + re.escape(name) + r"\b", text, re.IGNORECASE)
            if mention:
                actors.add(mention.group())

    for pattern in _ACTOR_PATTERNS:
        # Upper-case so "apt28" and "APT28" collapse to one entry.
        actors.update(hit.upper() for hit in pattern.findall(text))

    for actor in WELL_KNOWN_ACTORS:
        mention = re.search(r"\b" + re.escape(actor) + r"\b", text, re.IGNORECASE)
        if mention:
            actors.add(mention.group())

    for mention in _RANSOMWARE.finditer(text):
        name = mention.group(1)
        if name not in RANSOMWARE_FALSE_POSITIVES and len(name) >= 4:
            actors.add(name)

    # Deterministic order, matching extract_malware_families.
    return sorted(actors)
73
+
74
+
75
def extract_malware_families(text: str, malware: Optional[MalwareNames] = None) -> List[str]:
    """Find malware-family names in ``text`` using a caller-supplied name set.

    Without a :class:`~iocflow.providers.MalwareNames` (or with one whose
    ``names`` is empty) the result is ``[]``. False positives are limited by:

    1. Ignoring names of 3 characters or fewer (``at``, ``cmd``, ``ftp``).
    2. Matching single-word names case-sensitively and multi-word names
       case-insensitively, always on word boundaries.
    3. Rejecting names found in ``MALWARE_BLOCKLIST`` or
       ``SYSTEM_TOOL_BLOCKLIST``, both before matching and again after alias
       resolution.

    A matched name is replaced by ``malware.alias_map``'s entry for its
    lower-cased form when one exists. The result is sorted.
    """
    if malware is None or not malware.names:
        return []

    def _blocked(candidate: str) -> bool:
        # Blocklists hold lower-case entries, so compare in lower case.
        lowered = candidate.lower()
        return lowered in MALWARE_BLOCKLIST or lowered in SYSTEM_TOOL_BLOCKLIST

    families = set()
    for candidate in malware.names:
        if len(candidate) <= 3 or _blocked(candidate):
            continue

        bounded = r"\b" + re.escape(candidate) + r"\b"
        if " " in candidate:
            found = re.search(bounded, text, re.IGNORECASE)
        else:
            found = re.search(bounded, text)
        if not found:
            continue

        # Normalize an alias to its canonical name, then re-check the
        # blocklists because the canonical form may itself be blocked.
        resolved = malware.alias_map.get(candidate.lower(), candidate)
        if not _blocked(resolved):
            families.add(resolved)

    return sorted(families)
@@ -0,0 +1,15 @@
1
+ """Contact indicators: email addresses."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import List
6
+
7
# local-part "@" domain "." alphabetic TLD of at least two letters, anchored on
# word boundaries.
_EMAIL = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", re.IGNORECASE)

# Upper bound on how many addresses a single call returns.
MAX_EMAILS = 20


def extract_emails(text: str) -> List[str]:
    """Return the distinct email addresses in ``text``, lower-cased.

    Addresses appear in first-seen order and the list is truncated to
    ``MAX_EMAILS`` entries.
    """
    ordered: List[str] = []
    already_seen = set()
    for hit in _EMAIL.finditer(text):
        address = hit.group().lower()
        if address in already_seen:
            continue
        already_seen.add(address)
        ordered.append(address)
    return ordered[:MAX_EMAILS]
@@ -0,0 +1,84 @@
1
+ """File indicators: suspicious filenames and cryptographic hashes."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import Dict, List, Optional
6
+
7
+ # Extensions that indicate a potentially malicious file. ``.com`` is excluded
8
+ # because it collides with domain names (github.com).
9
SUSPICIOUS_EXTENSIONS = {
    # Scripts
    "ps1", "sh", "bat", "cmd", "vbs", "vbe", "js", "jse", "wsf", "wsh",
    # Executables (no .com — avoids domain false positives)
    "exe", "dll", "msi", "scr", "pif",
    # Documents with macros
    "docm", "xlsm", "pptm", "dotm", "xltm",
    # Archives (can carry malware)
    "iso", "img", "vhd", "vhdx",
    # Other
    "hta", "lnk", "jar", "msc",
}

# Sorted so the compiled pattern text is the same on every run (set iteration
# order is not stable). The trailing \b in _FILENAME means alternation order
# does not affect what matches: "x.jse" can never stop at "js".
_EXT_ALTERNATION = "|".join(re.escape(ext) for ext in sorted(SUSPICIOUS_EXTENSIONS))
_FILENAME = re.compile(rf"\b([a-zA-Z0-9_\-\.]+\.(?:{_EXT_ALTERNATION}))\b", re.IGNORECASE)

# Upper bound on how many filenames a single call returns.
MAX_FILENAMES = 20


def _url_basename(url: str) -> str:
    """Return the last path segment of ``url``, without query or fragment."""
    from urllib.parse import urlsplit

    try:
        path = urlsplit(url).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets); trim the query/fragment by hand rather than fail.
        path = url.split("#", 1)[0].split("?", 1)[0]
    return path.rsplit("/", 1)[-1]


def extract_filenames(text: str, urls: Optional[List[str]] = None) -> List[str]:
    """Extract suspicious script/executable filenames from text and URLs.

    Args:
        text: Text scanned with ``_FILENAME`` for names ending in one of
            ``SUSPICIOUS_EXTENSIONS``.
        urls: Optional URLs whose final *path* segment is also considered.
            The query string and fragment are ignored, so
            ``.../stage2.ps1?token=x`` yields ``stage2.ps1``. A URL with no
            path (``https://evil.sh``) contributes nothing, because its host
            is not a file.

    Returns:
        Filenames in first-seen order (text matches before URL matches),
        de-duplicated case-insensitively and capped at ``MAX_FILENAMES``.
    """
    filenames: List[str] = []
    seen = set()

    def _remember(candidate: str) -> None:
        # Case-insensitive de-duplication that keeps the first spelling seen.
        key = candidate.lower()
        if key not in seen:
            seen.add(key)
            filenames.append(candidate)

    for match in _FILENAME.finditer(text):
        _remember(match.group(1))

    for url in urls or ():
        name = _url_basename(url)
        if "." in name and name.rsplit(".", 1)[-1].lower() in SUSPICIOUS_EXTENSIONS:
            _remember(name)

    return filenames[:MAX_FILENAMES]
49
+
50
+
51
# Hex runs of exactly 64 / 40 / 32 characters. The \b anchors mean a shorter
# pattern cannot match inside a longer unbroken hex run.
_SHA256 = re.compile(r"\b[a-fA-F0-9]{64}\b")
_SHA1 = re.compile(r"\b[a-fA-F0-9]{40}\b")
_MD5 = re.compile(r"\b[a-fA-F0-9]{32}\b")


def extract_hashes(text: str) -> Dict[str, List[str]]:
    """Collect MD5, SHA1 and SHA256 hashes from ``text``.

    Hashes are lower-cased and de-duplicated in first-seen order. Work runs
    from the longest digest down: a 40-character token equal to the first 40
    characters of a reported SHA256 is dropped, and a 32-character token
    equal to the first 32 characters of a reported SHA1 or SHA256 is dropped.

    Returns:
        A dict with the keys ``"md5"``, ``"sha1"`` and ``"sha256"``, each
        mapping to a list of hex strings.
    """

    def _unique_lower(pattern, excluded=frozenset()):
        # A dict keeps first-seen order while discarding repeats.
        ordered = {}
        for hit in pattern.findall(text):
            digest = hit.lower()
            if digest not in excluded:
                ordered.setdefault(digest, None)
        return list(ordered)

    sha256 = _unique_lower(_SHA256)
    sha1 = _unique_lower(_SHA1, {digest[:40] for digest in sha256})
    md5 = _unique_lower(
        _MD5,
        {digest[:32] for digest in sha1} | {digest[:32] for digest in sha256},
    )
    return {"md5": md5, "sha1": sha1, "sha256": sha256}