mailsense 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mailsense/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # Copyright 2026 Samapriya Roy
2
+ # Apache 2.0 License
3
+ """
4
+ mailsense — Automated mail intelligence pipeline.
5
+ Gmail → mbox → image extraction → Gemini AI analysis.
6
+ """
7
+
8
# Package version string; cli.py imports it to build the --version output.
__version__ = "0.1.0"
# Package author.
__author__ = "Samapriya Roy"
mailsense/cli.py ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Samapriya Roy
3
+ # Apache 2.0 License
4
+ """
5
+ mailsense — Automated mail intelligence pipeline.
6
+
7
+ Turns Gmail USPS Informed Delivery emails into structured JSON
8
+ using Gemini AI across three stages:
9
+
10
+ config — Store credentials and defaults in ~/.mailsense
11
+ download — Gmail label → .mbox files
12
+ extract — .mbox files → images + metadata
13
+ analyze — images → structured JSON via Gemini AI
14
+ pipeline — Run all three stages end-to-end
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import sys
21
+
22
+ from mailsense import __version__
23
+ from mailsense import config as _config
24
+ from mailsense.commands import (
25
+ config_cmd,
26
+ download,
27
+ extract,
28
+ analyze,
29
+ pipeline,
30
+ )
31
+
32
+
33
def build_parser() -> argparse.ArgumentParser:
    """Assemble the top-level ``mailsense`` argument parser.

    Adds the ``--version``/``-V`` flag and a required ``COMMAND`` sub-parser
    group, then lets every command module attach its own sub-command.
    """
    root = argparse.ArgumentParser(
        prog="mailsense",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    root.add_argument(
        "--version", "-V",
        action="version",
        version=f"mailsense {__version__}",
    )

    commands = root.add_subparsers(dest="command", metavar="COMMAND")
    commands.required = True

    # Each command module registers itself, always in this fixed order.
    for module in (config_cmd, download, extract, analyze, pipeline):
        module.add_subparser(commands)

    return root
55
+
56
+
57
def main() -> None:
    """Parse the command line and run the selected sub-command.

    Exits with status 1 when no handler matches the command or the handler
    raises, and with status 130 when the user interrupts with Ctrl-C.
    """
    parser = build_parser()
    args = parser.parse_args()
    cfg = _config.load()

    # Command name -> entry point of the module implementing it.
    handlers = {
        "config": config_cmd.run,
        "download": download.run,
        "extract": extract.run,
        "analyze": analyze.run,
        "pipeline": pipeline.run,
    }

    if args.command not in handlers:
        parser.print_help()
        sys.exit(1)
    handler = handlers[args.command]

    try:
        handler(args, cfg)
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(130)
    except Exception as exc:
        # Top-level boundary: report the failure briefly instead of a traceback.
        print(f"\nError: {exc}", file=sys.stderr)
        sys.exit(1)
83
+
84
+
85
+ if __name__ == "__main__":
86
+ main()
@@ -0,0 +1 @@
1
+ # Copyright 2026 Samapriya Roy
@@ -0,0 +1,395 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Samapriya Roy
3
+ # Apache 2.0 License
4
+ """
5
+ Analyze mail images with Gemini AI.
6
+
7
+ Reads images + metadata.json produced by the extract step and uses
8
+ the Gemini API to extract structured information from each mail image.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ import time
17
+ from pathlib import Path
18
+ from typing import Optional
19
+
20
+ try:
21
+ import PIL.Image
22
+ HAS_PIL = True
23
+ except ImportError:
24
+ HAS_PIL = False
25
+
26
+ try:
27
+ from google import genai
28
+ from google.genai import types as genai_types
29
+ HAS_GENAI = True
30
+ except ImportError:
31
+ HAS_GENAI = False
32
+
33
+ try:
34
+ from rich.console import Console
35
+ from rich.panel import Panel
36
+ from rich.progress import (BarColumn, Progress, SpinnerColumn,
37
+ TaskProgressColumn, TextColumn,
38
+ TimeRemainingColumn)
39
+ from rich.table import Table
40
+ HAS_RICH = True
41
+ except ImportError:
42
+ HAS_RICH = False
43
+
44
+ # ── Constants ──────────────────────────────────────────────────────────────────
45
+
46
# Lower-case file suffixes that _is_wanted accepts as images.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp")
# Lower-case file-name prefixes that _is_wanted rejects.
IGNORE_PREFIXES = ("content", "mailer", "ra")
# Model used when neither --model nor the gemini_model config key supplies one.
DEFAULT_MODEL = "gemini-2.0-flash"
49
+
50
+ EXTRACTION_PROMPT = """
51
+ Analyze this image of mail and extract all relevant information.
52
+
53
+ 1. Identify if it is a marketing flyer (look for 'PRSRT STD', headshots,
54
+ promotional language, or 'Current Resident').
55
+ 2. If it IS a flyer: Set 'status' to 'Ignored' and 'is_marketing' to true.
56
+ 3. If it is OFFICIAL mail (bills, tax forms, First Class, invoices, statements):
57
+ Set 'status' to 'Processed' and 'is_marketing' to false.
58
+
59
+ Extract:
60
+ - sender: name, organization, address (street, city, state, zip_code)
61
+ - recipient: name/names, address (street, city, state, zip_code)
62
+ - postage_details: type/service_class, status, amount, date, permit info
63
+ - document_info: document type, visible form fields, reference numbers, tracking codes
64
+ - content_summary: 1-2 sentence summary of what this mail is about and its purpose
65
+ - If marketing: what is being promoted and the call to action
66
+ - If official: type of document and purpose (e.g. "Tax form 1099-MISC for payment reporting")
67
+
68
+ Return ONLY a valid JSON object. No markdown, no code blocks, no explanatory text.
69
+ """
70
+
71
+
72
+ # ── Helpers ────────────────────────────────────────────────────────────────────
73
+
74
def _is_wanted(filename: str) -> bool:
    """Return True when *filename* names an image that should be analyzed.

    The check is case-insensitive: the suffix must be one of
    VALID_EXTENSIONS and the name must not start with any of IGNORE_PREFIXES.
    """
    name = filename.lower()
    if name.startswith(IGNORE_PREFIXES):
        return False
    return Path(name).suffix in VALID_EXTENSIONS
80
+
81
+
82
def _load_metadata(path: Path) -> dict:
    """Read the UTF-8 encoded JSON file at *path* and return its decoded content."""
    raw = Path(path).read_text(encoding="utf-8")
    return json.loads(raw)
85
+
86
+
87
def _find_email_meta(filename: str, metadata: dict) -> Optional[dict]:
    """Return header fields of the email entry that lists image *filename*.

    Args:
        filename: Image file name to look up.
        metadata: Parsed metadata.json content. Its "emails" list holds
            entries that each carry an "images" list of records.

    Returns:
        A dict with the entry's "date", "date_iso", "subject", "from",
        "message_id" and "index" (None for fields the entry lacks), or
        None when no entry lists the image.
    """
    wanted_fields = ("date", "date_iso", "subject", "from", "message_id", "index")
    # "or []" tolerates keys that are present but null in the JSON.
    for entry in metadata.get("emails") or []:
        for image in entry.get("images") or []:
            # .get() so a record without a "filename" key is skipped instead
            # of raising KeyError and failing the whole image.
            if image.get("filename") == filename:
                return {field: entry.get(field) for field in wanted_fields}
    return None
100
+
101
+
102
def _clean_json(text: str) -> str:
    """Strip surrounding whitespace and Markdown code fences from *text*.

    An opening "```json" tag is matched case-insensitively, so "```JSON"
    is removed as well. A bare opening "```" and a closing "```" are also
    removed.

    Args:
        text: Raw model reply.

    Returns:
        The reply without fences, stripped of leading/trailing whitespace.
    """
    text = text.strip()
    tagged_fence = "```json"
    # Compare case-insensitively; a case-sensitive match left "JSON" behind.
    if text[:len(tagged_fence)].lower() == tagged_fence:
        text = text[len(tagged_fence):]
    if text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()
110
+
111
+
112
def _collect_work_units(input_dir: Path) -> list[tuple[Path, Path]]:
    """Locate the folders to analyze beneath *input_dir*.

    Returns a single ``(input_dir, metadata.json)`` pair when the metadata
    file sits directly in *input_dir*. Otherwise returns one pair per
    immediate sub-directory that contains a metadata.json, in sorted order.
    Prints an error to stderr and exits with status 1 when nothing is found.
    """
    top_level_meta = input_dir / "metadata.json"
    if top_level_meta.exists():
        return [(input_dir, top_level_meta)]

    candidates = (
        (child, child / "metadata.json")
        for child in sorted(input_dir.iterdir())
        if child.is_dir()
    )
    units = [(folder, meta) for folder, meta in candidates if meta.exists()]
    if units:
        return units

    print(f"Error: no metadata.json found in '{input_dir}' or its subdirectories.",
          file=sys.stderr)
    sys.exit(1)
127
+
128
+
129
def _call_gemini(client, model_name: str, image_path: Path) -> str:
    """
    Send one mail image together with EXTRACTION_PROMPT to Gemini.

    The PIL image object is passed directly in ``contents``; the request
    asks for an ``application/json`` reply at temperature 0.1.

    Args:
        client: google-genai client exposing ``models.generate_content``.
        model_name: Gemini model identifier.
        image_path: Path of the image file to analyze.

    Returns:
        The text of the model reply.

    Raises:
        ValueError: If the reply carries no text.
    """
    # The context manager closes the image file once the request is done;
    # without it one file handle stayed open per processed image.
    with PIL.Image.open(image_path) as img:
        response = client.models.generate_content(
            model=model_name,
            contents=[EXTRACTION_PROMPT, img],
            config=genai_types.GenerateContentConfig(
                response_mime_type="application/json",
                temperature=0.1,
            ),
        )

    text = response.text
    # NOTE(review): the SDK appears to type ``text`` as optional (e.g. for a
    # reply without text parts) - confirm. Failing here gives a clear message
    # instead of an AttributeError further down the pipeline.
    if text is None:
        raise ValueError("Gemini returned no text for this image")
    return text
145
+
146
+
147
+ # ── Core processor ─────────────────────────────────────────────────────────────
148
+
149
def _process_folder(
    image_dir: Path,
    metadata_path: Path,
    output_dir: Path,
    client,
    model_name: str,
    delay: float,
    dry_run: bool,
    progress=None,
    outer_task=None,
) -> dict:
    """Analyze every wanted image in *image_dir*, writing one JSON file per image.

    Images whose output file already exists are skipped. In dry-run mode
    the image is only reported and counted as processed. Otherwise the
    image is sent to Gemini, the reply is parsed as JSON, enriched with
    the file name and (when found) the email metadata, and written to
    *output_dir*. Failures are recorded and do not stop the loop. After
    each attempted API call the function sleeps *delay* seconds.

    Returns a summary dict with the counts and the collected error details.
    """
    metadata = _load_metadata(metadata_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    image_files = sorted(filter(_is_wanted, os.listdir(image_dir)))
    total = len(image_files)
    processed = 0
    skipped = 0
    errors = 0
    failures: list[dict] = []

    inner_task = (
        progress.add_task(f"[cyan]{image_dir.name}", total=total)
        if progress else None
    )

    def _advance() -> None:
        # Move this folder's progress bar forward by one image, if there is one.
        if progress and inner_task is not None:
            progress.update(inner_task, advance=1)

    for position, filename in enumerate(image_files, 1):
        if progress and inner_task is not None:
            progress.update(inner_task,
                            description=f"[cyan]{image_dir.name} — {filename[:35]}")

        target = output_dir / f"{Path(filename).stem}.json"

        if target.exists():
            print(f" ⏭ Skipping {filename} (already processed)")
            skipped += 1
            _advance()
            continue

        if dry_run:
            print(f" [dry-run] Would process: {filename}")
            processed += 1
            _advance()
            continue

        # Kept outside the try so the JSON error handler can always quote it.
        cleaned = ""
        try:
            email_meta = _find_email_meta(filename, metadata)
            reply = _call_gemini(client, model_name, image_dir / filename)
            cleaned = _clean_json(reply)
            payload = json.loads(cleaned)
            payload["filename"] = filename
            if email_meta:
                payload["mail_metadata"] = email_meta
            target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
            processed += 1
            print(f" ✓ [{position}/{total}] {filename}")
        except json.JSONDecodeError as exc:
            errors += 1
            failures.append({"filename": filename,
                             "error": f"JSON parse: {exc}",
                             "raw": cleaned[:400]})
            print(f" ✗ JSON error — {filename}: {exc}", file=sys.stderr)
        except Exception as exc:
            errors += 1
            failures.append({"filename": filename, "error": str(exc)})
            print(f" ✗ Error — {filename}: {exc}", file=sys.stderr)
        finally:
            _advance()
            time.sleep(delay)

    # The summary is only returned; writing it to _summary.json is disabled.
    summary = {
        "source_dir": str(image_dir),
        "output_dir": str(output_dir),
        "processing_date": time.strftime("%Y-%m-%d %H:%M:%S"),
        "total_files": total,
        "processed": processed,
        "skipped": skipped,
        "errors": errors,
        "error_details": failures,
    }
    if progress and outer_task is not None:
        progress.update(outer_task, advance=1)
    return summary
235
+
236
+
237
def _display_summary_plain(summaries: list[dict]) -> None:
    """Print a fixed-width text table of per-folder counts to stdout.

    One row is printed per summary; a TOTAL row is appended only when more
    than one summary is given.
    """
    count_keys = ("total_files", "processed", "skipped", "errors")
    rule = " " + "─" * 60

    def _row(label: str, counts: dict) -> str:
        # Shared layout for folder rows and the TOTAL row.
        return (f" {label:<30} {counts['total_files']:>6} {counts['processed']:>6} "
                f"{counts['skipped']:>6} {counts['errors']:>5}")

    print("\n── Analysis Summary ──────────────────────────────────")
    print(f" {'Folder':<30} {'Total':>6} {'Done':>6} {'Skip':>6} {'Err':>5}")
    print(rule)
    for summary in summaries:
        print(_row(Path(summary["source_dir"]).name, summary))

    if len(summaries) <= 1:
        return

    grand = {key: sum(item[key] for item in summaries) for key in count_keys}
    print(rule)
    print(_row("TOTAL", grand))
251
+
252
+
253
def analyze(
    input_dir: Path,
    output_dir: Path,
    api_key: str,
    model_name: str = DEFAULT_MODEL,
    delay: float = 4.0,
    dry_run: bool = False,
) -> list[dict]:
    """
    Public API: run the Gemini analysis over every work unit in input_dir.

    Outside dry-run mode, google-genai and Pillow must be importable
    (ImportError otherwise) and a client is created from api_key. When
    rich is available and this is not a dry run, a panel, progress bars
    and a summary table are rendered; otherwise a plain-text summary is
    printed. Returns the list of per-folder summary dicts.
    """
    client = None
    if not dry_run:
        if not HAS_GENAI:
            raise ImportError(
                "google-genai not installed. pip install google-genai"
            )
        if not HAS_PIL:
            raise ImportError("Pillow not installed. pip install Pillow")
        client = genai.Client(api_key=api_key)

    work_units = _collect_work_units(input_dir)
    batch = len(work_units) > 1
    summaries = []

    def _run_units(progress=None, outer_task=None) -> None:
        # Process each folder; outputs mirror the sub-folder layout in batch mode.
        for image_dir, meta_path in work_units:
            rel = image_dir.relative_to(input_dir) if batch else Path(".")
            summaries.append(_process_folder(
                image_dir, meta_path, output_dir / rel,
                client, model_name, delay, dry_run,
                progress=progress, outer_task=outer_task,
            ))

    if not (HAS_RICH and not dry_run):
        _run_units()
        _display_summary_plain(summaries)
        return summaries

    console = Console()
    console.print(Panel.fit(
        f"[bold cyan]Mail Analysis[/bold cyan]\n"
        f"Input: {input_dir}\n"
        f"Output: {output_dir}\n"
        f"Model: {model_name}\n"
        f"Delay: {delay}s | Folders: {len(work_units)}",
        border_style="cyan",
    ))

    progress_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("{task.completed}/{task.total}"),
        TaskProgressColumn(),
        TimeRemainingColumn(),
    )
    with Progress(*progress_columns, console=console, transient=False) as progress:
        # The overall bar only exists when several folders are processed.
        outer_task = None
        if batch:
            outer_task = progress.add_task("[bold]Overall", total=len(work_units))
        _run_units(progress, outer_task)

    table = Table(title="Analysis Summary", header_style="bold cyan", show_lines=True)
    table.add_column("Folder", style="cyan", no_wrap=True)
    numeric_columns = (
        ("Total", "white", "total_files"),
        ("Processed", "green", "processed"),
        ("Skipped", "yellow", "skipped"),
        ("Errors", "red", "errors"),
    )
    for heading, colour, _ in numeric_columns:
        table.add_column(heading, style=colour, justify="right")
    for summary in summaries:
        table.add_row(
            Path(summary["source_dir"]).name,
            *(str(summary[key]) for _, _, key in numeric_columns),
        )
    console.print(table)

    return summaries
338
+
339
+
340
+ # ── CLI entry ──────────────────────────────────────────────────────────────────
341
+
342
def add_subparser(subparsers) -> None:
    """Register the ``analyze`` command and its options on *subparsers*."""
    cmd = subparsers.add_parser(
        "analyze",
        help="Analyze mail images with Gemini AI",
        description=__doc__,
    )

    # Both directory options are mandatory and share the same shape.
    directory_options = (
        ("--input-dir", "-i", "Output directory from the extract step"),
        ("--output-dir", "-o", "Directory to save extracted JSON files"),
    )
    for long_flag, short_flag, help_text in directory_options:
        cmd.add_argument(long_flag, short_flag, required=True, metavar="DIR",
                         help=help_text)

    # Optional settings default to None so run() can fall back to the config.
    cmd.add_argument(
        "--api-key", "-k",
        default=None,
        metavar="KEY",
        help="Gemini API key (overrides config / GEMINI_API_KEY env var)",
    )
    cmd.add_argument(
        "--model", "-m",
        default=None,
        metavar="MODEL",
        help=f"Gemini model name (default: {DEFAULT_MODEL})",
    )
    cmd.add_argument(
        "--delay", "-d",
        type=float,
        default=None,
        metavar="SEC",
        help="Seconds between API requests (default: 4 for free tier)",
    )
    cmd.add_argument(
        "--dry-run", "-n",
        action="store_true",
        help="Show what would be processed without calling the API",
    )
360
+
361
+
362
def run(args, cfg: dict) -> None:
    """Entry point of the ``analyze`` command.

    Resolves the API key (``--api-key``, then the ``gemini_api_key``
    config value, then the GEMINI_API_KEY environment variable), the
    model and the request delay, validates them, and calls analyze().

    Args:
        args: Parsed command-line namespace of the ``analyze`` sub-command.
        cfg: Stored configuration values.

    Exits with status 1 (after printing to stderr) when no API key is
    available outside dry-run mode, when the input path is missing or is
    not a directory, or when the delay is negative.
    """
    api_key = (
        args.api_key
        or cfg.get("gemini_api_key")
        or os.environ.get("GEMINI_API_KEY")
    )
    if not api_key and not args.dry_run:
        print(
            "Error: no Gemini API key. Use --api-key, set GEMINI_API_KEY, "
            "or run: mailsense config set gemini_api_key YOUR_KEY",
            file=sys.stderr,
        )
        sys.exit(1)

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    if not input_dir.exists():
        print(f"Error: input directory not found: {input_dir}", file=sys.stderr)
        sys.exit(1)
    # A regular file passes exists() but would only fail later, when the
    # folder scan tries to list it; reject it here with a clear message.
    if not input_dir.is_dir():
        print(f"Error: input path is not a directory: {input_dir}", file=sys.stderr)
        sys.exit(1)

    # "or" rather than a .get() default so an empty or null stored value
    # still falls back to DEFAULT_MODEL.
    model_name = args.model or cfg.get("gemini_model") or DEFAULT_MODEL
    delay = (
        args.delay if args.delay is not None
        else float(cfg.get("api_delay", 4.0))
    )
    # time.sleep() rejects negative values, which would otherwise abort the
    # run right after the first API call.
    if delay < 0:
        print(f"Error: delay must not be negative: {delay}", file=sys.stderr)
        sys.exit(1)

    analyze(
        input_dir=input_dir,
        output_dir=output_dir,
        api_key=api_key or "",
        model_name=model_name,
        delay=delay,
        dry_run=args.dry_run,
    )
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Samapriya Roy
3
+ # Apache 2.0 License
4
+ """
5
+ Config — Manage mailsense credentials and defaults.
6
+
7
+ Settings are stored in ~/.mailsense (JSON, mode 0600).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import sys
13
+
14
+
15
def add_subparser(subparsers) -> None:
    """Register the ``config`` command and its actions on *subparsers*.

    The chosen action is stored in ``config_action``; the actions are
    show, set, unset, keys and configure.
    """
    config_parser = subparsers.add_parser(
        "config",
        help="Manage credentials and defaults stored in ~/.mailsense",
        description=__doc__,
    )
    actions = config_parser.add_subparsers(dest="config_action", metavar="ACTION")

    # show: no arguments
    actions.add_parser("show", help="Print current configuration (passwords masked)")

    # set KEY VALUE
    set_parser = actions.add_parser("set", help="Set a configuration key")
    set_parser.add_argument("key", help="Configuration key (see 'show' for available keys)")
    set_parser.add_argument("value", help="Value to store")

    # unset KEY
    unset_parser = actions.add_parser("unset", help="Remove a configuration key")
    unset_parser.add_argument("key", help="Key to remove")

    # keys: no arguments
    actions.add_parser(
        "keys",
        help="List all recognised configuration keys and their descriptions",
    )

    # configure [KEY ...]
    wizard_parser = actions.add_parser(
        "configure",
        help="Interactive wizard to set all (or specific) config values",
    )
    wizard_parser.add_argument(
        "keys",
        nargs="*",
        metavar="KEY",
        help="Optional: specific keys to configure (default: all)",
    )
46
+
47
+
48
def _mask_if_secret(key: str, value) -> str:
    """Return *value* as text, replaced by bullets when *key* names a secret.

    A key counts as secret when it contains "password" or "key".
    """
    text = str(value)
    if "password" in key or "key" in key:
        return "•" * len(text)
    return text


def run(args, cfg: dict) -> None:
    """Entry point of the ``config`` command.

    Dispatches on ``args.config_action``: show (also the default when no
    action is given), set, unset, keys or configure. Secret values are
    masked in everything that is printed. Any other action prints a usage
    hint to stderr and exits with status 1.

    Args:
        args: Parsed command-line namespace of the ``config`` sub-command.
        cfg: Configuration loaded by the caller; not used here, the stored
            configuration is re-read through ``config.load()``.
    """
    from mailsense import config

    action = getattr(args, "config_action", None)

    if action == "show" or action is None:
        data = config.load()
        if not data:
            print("No configuration stored yet. Run: mailsense config set <key> <value>")
            return
        descriptions = config.describe_keys()
        print("Current configuration (~/.mailsense)")
        print("─" * 48)
        for key, val in data.items():
            print(f" {key:<22} {_mask_if_secret(key, val)}")
            desc = descriptions.get(key, "")
            if desc:
                print(f" {'':22} ({desc})")

    elif action == "set":
        config.set_value(args.key, args.value)
        print(f" Set {args.key!r} = {_mask_if_secret(args.key, args.value)}")

    elif action == "unset":
        if config.unset(args.key):
            print(f" Removed {args.key!r}")
        else:
            print(f" Key {args.key!r} was not set.")

    elif action == "keys":
        descriptions = config.describe_keys()
        print("Available configuration keys:")
        print("─" * 56)
        for key, desc in descriptions.items():
            print(f" {key:<22} {desc}")

    elif action == "configure":
        keys = args.keys if args.keys else None
        config.configure(keys)

    else:
        # The hint lists every supported action, including "configure".
        print("Unknown action. Use: show | set | unset | keys | configure",
              file=sys.stderr)
        sys.exit(1)