emlpeek 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
emlpeek-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: emlpeek
3
+ Version: 1.0.0
4
+ Summary: Email investigation tool
5
+ Author-email: Your Name <you@example.com>
6
+ License: MIT
7
+ Keywords: email,investigation,forensics
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+
14
+ # emlpeek
15
+ Email investigation tool
@@ -0,0 +1,2 @@
1
+ # emlpeek
2
+ Email investigation tool
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "emlpeek"
7
+ version = "1.0.0"
8
+ description = "Email investigation tool"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Your Name", email = "you@example.com" }]
13
+ keywords = ["email", "investigation", "forensics"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ ]
19
+
20
+ [tool.setuptools]
21
+ package-dir = {"" = "src"}
22
+
23
+ [tool.setuptools.packages.find]
24
+ where = ["src"]
25
+ include = ["soc_email_investigator*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: emlpeek
3
+ Version: 1.0.0
4
+ Summary: Email investigation tool
5
+ Author-email: Your Name <you@example.com>
6
+ License: MIT
7
+ Keywords: email,investigation,forensics
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+
14
+ # emlpeek
15
+ Email investigation tool
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/emlpeek.egg-info/PKG-INFO
4
+ src/emlpeek.egg-info/SOURCES.txt
5
+ src/emlpeek.egg-info/dependency_links.txt
6
+ src/emlpeek.egg-info/top_level.txt
7
+ src/soc_email_investigator/__init__.py
8
+ src/soc_email_investigator/extract.py
9
+ src/soc_email_investigator/main.py
10
+ src/soc_email_investigator/report.py
11
+ src/soc_email_investigator/rules.py
@@ -0,0 +1 @@
1
+ soc_email_investigator
File without changes
@@ -0,0 +1,125 @@
1
+ from pathlib import Path
2
+ from email import policy
3
+ from email.parser import BytesParser
4
+ from urllib.parse import urlparse
5
+ import re
6
+
7
# Matches "http://" or "https://" (any letter case) followed by a run of
# characters that excludes whitespace, quotes, angle brackets and parentheses.
URL_RE = re.compile(
    r"https?://[^\s\"'<>()]+",
    flags=re.IGNORECASE,
)

# Matches a line that consists solely of one e-mail-like token (captured as
# group 1), optionally surrounded by whitespace.
EMAIL_LINE_RE = re.compile(
    r"^\s*([A-Z0-9._%+\-]+@[^>\s]+)\s*$",
    flags=re.IGNORECASE,
)
9
+
10
+ def _extract_address_domain(address: str | None) -> str | None:
11
+ if not address:
12
+ return None
13
+ # Strip common forms like: "Name <user@domain.com>"
14
+ if "<" in address and ">" in address:
15
+ addr = address.split("<", 1)[1].split(">", 1)[0].strip()
16
+ else:
17
+ addr = address.strip()
18
+ if "@" not in addr:
19
+ return None
20
+ return addr.split("@", 1)[1].lower()
21
+
22
def _extract_urls(text: str) -> list[str]:
    """Collect every http(s) URL found in *text*, in order of appearance.

    Each match of ``URL_RE`` has trailing punctuation (closing parenthesis,
    period, comma, semicolon, colon, exclamation/question mark, quotes)
    stripped from its right-hand end. Duplicates are kept.

    Args:
        text: Text to scan; an empty or falsy value yields an empty list.

    Returns:
        The cleaned URLs as a list of strings.
    """
    if not text:
        return []
    trailing_punctuation = ").,;:!?\"'"
    return [match.rstrip(trailing_punctuation) for match in URL_RE.findall(text)]
32
+
33
+ def _url_host(url: str) -> str | None:
34
+ try:
35
+ parsed = urlparse(url)
36
+ host = parsed.hostname
37
+ if host:
38
+ return host
39
+ except Exception:
40
+ pass
41
+ return None
42
+
43
def _decode_part(part):
    """Return the content of one message part, decoding best-effort."""
    try:
        return part.get_content()
    except Exception:
        # Deliberate best-effort: fall back to decoding the raw payload.
        payload = part.get_payload(decode=True) or b""
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is unknown to Python, which is a likely
            # reason get_content() failed above. Decode as UTF-8 rather than
            # aborting the whole extraction.
            return payload.decode("utf-8", errors="replace")


def _classify_host(host) -> str:
    """Classify a URL host as unknown, ip_literal, punycode or hostname."""
    import ipaddress  # local import keeps this block self-contained

    if not host:
        return "unknown"
    # Dotted-quad check kept as-is so every host flagged before is still flagged.
    if re.fullmatch(r"\d{1,3}(\.\d{1,3}){3}", host):
        return "ip_literal"
    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass
    else:
        # Covers IPv6 literals such as "::1", which the dotted-quad check misses.
        return "ip_literal"
    # An IDN label can sit anywhere in the name (e.g. "www.xn--pple-43d.com"),
    # so inspect every label rather than only the start of the host.
    if any(label.startswith("xn--") for label in host.split(".")):
        return "punycode"
    return "hostname"


def extract_from_eml(eml_path: Path) -> dict:
    """Parse an .eml file and return the indicators extracted from it.

    Args:
        eml_path: Path of the message file; it is read as bytes.

    Returns:
        ``{"indicators": {...}}`` where the inner dict has three keys:
        ``"email"`` (From, Reply-To, To, Subject, Date and
        Authentication-Results header values plus the From/Reply-To domains),
        ``"received_lines"`` (one stripped string per Received header) and
        ``"urls"`` (one ``{"url", "host", "host_type"}`` dict per URL found
        in the body or the headers).
    """
    raw = eml_path.read_bytes()
    msg = BytesParser(policy=policy.default).parsebytes(raw)

    # Collect Received headers and a plain-text dump of every header.
    # The dump is joined once at the end instead of growing a string with +=.
    received_lines = []
    header_chunks = []
    for name, value in msg.raw_items():
        if name is not None and name.lower() == "received":
            received_lines.append(str(value).strip())
        header_chunks.append(f"{name}: {value}\n" if name else f"{value}\n")
    headers_text = "".join(header_chunks)

    # Body extraction (best-effort).
    body_parts = []
    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = str(part.get("Content-Disposition") or "")
            # Only inline text parts count as body; attachments are skipped.
            if ctype in ("text/plain", "text/html") and "attachment" not in disp.lower():
                body_parts.append(_decode_part(part))
    else:
        body_parts.append(_decode_part(msg))
    body_text = "\n".join(str(chunk) for chunk in body_parts if chunk is not None)

    from_hdr = msg.get("From")
    reply_to_hdr = msg.get("Reply-To")

    # URLs are searched in the body and in the header dump.
    url_indicators = []
    for url in _extract_urls(body_text + "\n" + headers_text):
        host = _url_host(url)
        url_indicators.append({
            "url": url,
            "host": host,
            "host_type": _classify_host(host),
        })

    indicators = {
        "email": {
            "from": from_hdr,
            "from_domain": _extract_address_domain(from_hdr),
            "reply_to": reply_to_hdr,
            "reply_to_domain": _extract_address_domain(reply_to_hdr),
            "to": msg.get("To"),
            "subject": msg.get("Subject"),
            "date": msg.get("Date"),
            "authentication_results": msg.get("Authentication-Results"),
        },
        "received_lines": received_lines,
        "urls": url_indicators,
    }

    return {"indicators": indicators}
@@ -0,0 +1,62 @@
1
+ import argparse
2
+ from pathlib import Path
3
+ from datetime import datetime, timezone
4
+ import json
5
+ import hashlib
6
+
7
+ from .extract import extract_from_eml
8
+ from .rules import run_rules
9
+ from .report import write_outputs
10
+
11
+
12
# Tool identification recorded in the evidence manifest of every run.
TOOL_VERSION = "v1.0"
# No trailing space: this value is written verbatim into evidence_manifest.json.
TOOL_NAME = "emlpeek"
14
+
15
def sha256_file(p: Path) -> str:
    """Return the SHA-256 hex digest of the file at *p*.

    The file is read in 1 MiB chunks, so it is never loaded into memory
    all at once.
    """
    digest = hashlib.sha256()
    chunk_size = 1024 * 1024
    with p.open("rb") as stream:
        while chunk := stream.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
21
+
22
def main():
    """Command-line entry point: analyse one message file and write results.

    Reads ``input_path`` and ``--out`` from the command line, extracts the
    indicators, runs the rules, builds an evidence manifest (tool name and
    version, input path/filename/size/SHA-256, UTC run time) and passes
    everything to ``write_outputs``.

    Exits through ``ArgumentParser.error`` (usage message, status 2) when
    ``input_path`` is not an existing file.
    """
    ap = argparse.ArgumentParser(description="Email investigation tool")
    ap.add_argument("input_path", help="path of the .eml file to analyse")
    ap.add_argument("--out", required=True, help="directory for the output files")
    args = ap.parse_args()

    in_path = Path(args.input_path)
    # Validate before touching the filesystem, so a typo in the path gives a
    # clean usage error and does not leave an empty output directory behind.
    if not in_path.is_file():
        ap.error(f"input file not found: {in_path}")

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    eml_data = extract_from_eml(in_path)
    signals = run_rules(eml_data)

    evidence_manifest = {
        "tool_name": TOOL_NAME,
        "tool_version": TOOL_VERSION,
        "input": {
            "path": str(in_path),
            "filename": in_path.name,
            "size_bytes": in_path.stat().st_size,
            "sha256": sha256_file(in_path),
        },
        "run_utc": datetime.now(timezone.utc).isoformat(),
    }

    write_outputs(
        out_dir=out_dir,
        indicators=eml_data["indicators"],
        signals=signals,
        verdict=signals["verdict"],
        confidence=signals["confidence"],
        report_context=signals["context"],
        evidence_manifest=evidence_manifest,
    )


if __name__ == "__main__":
    main()
59
+
60
+
61
+
62
+
@@ -0,0 +1,59 @@
1
+ import json
2
+ from datetime import datetime, timezone
3
+
4
def write_outputs(out_dir, indicators, signals, verdict, confidence, report_context, evidence_manifest):
    """Write the JSON artefacts and the Markdown report into *out_dir*.

    Files written: ``indicators.json``, ``signals.json``,
    ``evidence_manifest.json`` and ``report.md`` (all UTF-8).

    Args:
        out_dir: Existing directory as a path object supporting ``/``.
        indicators: Indicator dict; its ``"email"`` and ``"urls"`` entries
            feed the report.
        signals: Accepted for interface compatibility; this function does
            not read it (``signals.json`` is built from the next three
            arguments).
        verdict: Verdict label shown in the report and in ``signals.json``.
        confidence: Numeric confidence, rendered with two decimals.
        report_context: Iterable of reason strings listed under
            "Why this verdict"; may be empty.
        evidence_manifest: Dict dumped as-is; its ``"run_utc"`` entry, if
            present and non-empty, is shown in the report.
    """
    def _dump_json(filename, payload):
        # Single place for the JSON formatting shared by all three artefacts.
        (out_dir / filename).write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

    _dump_json("indicators.json", indicators)
    _dump_json(
        "signals.json",
        {"verdict": verdict, "confidence": confidence, "context": report_context},
    )
    _dump_json("evidence_manifest.json", evidence_manifest)

    run_utc = evidence_manifest.get("run_utc", "")
    # Tolerate a missing "email" section rather than failing after the JSON
    # files were already written.
    from_domain = (indicators.get("email") or {}).get("from_domain")
    urls = indicators.get("urls", [])
    max_listed = 10  # the report lists only the first few URLs

    lines = [
        "# Email Investigation Report",
        "",
        f"- **Verdict:** {verdict}",
        f"- **Confidence:** {confidence:.2f}",
    ]
    if run_utc:
        lines.append(f"- **Run (UTC):** {run_utc}")
    lines.append("")
    lines.append("## Key evidence")
    lines.append(f"- From domain: {from_domain}")
    lines.append(f"- URLs found: {len(urls)}")
    for i, u in enumerate(urls[:max_listed], start=1):
        lines.append(f"  {i}. {u.get('url')} (host: {u.get('host')}, type: {u.get('host_type')})")
    hidden = len(urls) - max_listed
    if hidden > 0:
        # Make the truncation visible so a reader does not assume the list
        # is complete; the full list is in indicators.json.
        lines.append(f"  ... and {hidden} more URL(s) not shown (see indicators.json)")
    lines.append("")
    lines.append("## Why this verdict")
    if report_context:
        lines.extend(f"- {c}" for c in report_context)
    else:
        lines.append("- (no context recorded)")
    lines.append("")
    lines.append("## Notes")
    lines.append("- This is a rule-based system for v1 (it does not guarantee guilt/innocence).")

    (out_dir / "report.md").write_text("\n".join(lines), encoding="utf-8")
@@ -0,0 +1,63 @@
1
def _hosts_related(host: str, base: str) -> bool:
    """Return True when the names are equal or one is a sub-domain of the other."""
    # Compare on label boundaries. A plain substring test would accept
    # look-alikes such as "paypal.com.evil.io" or "notpaypal.com" for the
    # sender domain "paypal.com".
    return host == base or host.endswith("." + base) or base.endswith("." + host)


def run_rules(eml_data: dict) -> dict:
    """Apply the rule set to extracted indicators and return a verdict.

    Args:
        eml_data: Dict with an ``"indicators"`` entry containing an
            ``"email"`` dict (``from_domain``, ``reply_to_domain``,
            ``authentication_results``) and an optional ``"urls"`` list of
            dicts with ``host`` and ``host_type``.

    Returns:
        ``{"verdict", "confidence", "context"}`` where verdict is
        ``"phishing"``, ``"legit"`` or ``"uncertain"``, confidence is a
        float and context is a list of human-readable reasons. The two
        early-return paths (no evidence at all, IP-literal URL) do not add
        the Authentication-Results note.
    """
    indicators = eml_data["indicators"]
    email = indicators["email"]

    urls = indicators.get("urls", [])
    from_domain = email.get("from_domain")
    reply_to_domain = email.get("reply_to_domain")

    verdict = "uncertain"
    confidence = 0.0
    context = []

    has_sender = bool(from_domain or reply_to_domain)
    has_urls = len(urls) > 0

    if not has_urls and not has_sender:
        context.append("No URLs and no From/Reply-To domains found.")
        return {"verdict": "uncertain", "confidence": 0.2, "context": context}

    # The sender domain the URL hosts are compared against.
    base_domain = from_domain or reply_to_domain

    # Rule 1: IP literal in URL => phishing
    for u in urls:
        if u.get("host_type") == "ip_literal":
            context.append(f"Found URL with IP-literal host: {u.get('host')}")
            return {"verdict": "phishing", "confidence": 0.95, "context": context}

    # Rule 2: URL host that is neither the sender domain nor a parent or
    # sub-domain of it.
    if has_urls and base_domain:
        base = base_domain.lower().rstrip(".")
        for u in urls:
            host = (u.get("host") or "").lower().rstrip(".")
            if not host:
                continue
            if not _hosts_related(host, base):
                verdict = "phishing"
                confidence = 0.75
                context.append(f"URL host {host} does not relate to sender domain {base}.")
                break

    # Rule 3: decide legit/uncertain if not phishing
    if verdict != "phishing":
        if has_urls and base_domain and len(context) == 0:
            verdict = "legit"
            confidence = 0.65
            context.append("URLs found and no strong phishing indicators matched.")
        else:
            verdict = "uncertain"
            confidence = 0.45
            context.append("Not enough evidence to label as phishing/legit confidently.")

    # Record whether an Authentication-Results header exists; its content is
    # not interpreted.
    if email.get("authentication_results"):
        context.append("Authentication-Results header is present (no pass/fail inferred).")
    else:
        context.append("Authentication-Results header not found (may reduce confidence).")

    return {"verdict": verdict, "confidence": confidence, "context": context}