PyPI - gmail-cleaner - Versions diffs - 0.1.0__py3-none-any.whl - Mend

gmail-cleaner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

gmail_cleaner/__init__.py +0 -0
gmail_cleaner/analyze.py +131 -0
gmail_cleaner/auth.py +57 -0
gmail_cleaner/config.py +28 -0
gmail_cleaner/db.py +68 -0
gmail_cleaner/delete.py +341 -0
gmail_cleaner/main.py +60 -0
gmail_cleaner/sync.py +160 -0
gmail_cleaner-0.1.0.dist-info/METADATA +121 -0
gmail_cleaner-0.1.0.dist-info/RECORD +18 -0
gmail_cleaner-0.1.0.dist-info/WHEEL +5 -0
gmail_cleaner-0.1.0.dist-info/entry_points.txt +2 -0
gmail_cleaner-0.1.0.dist-info/licenses/LICENSE +21 -0
gmail_cleaner-0.1.0.dist-info/top_level.txt +2 -0
tests/__init__.py +0 -0
tests/conftest.py +50 -0
tests/test_analyze.py +61 -0
tests/test_delete.py +92 -0

gmail_cleaner/__init__.py ADDED Viewed

File without changes

gmail_cleaner/analyze.py ADDED Viewed

@@ -0,0 +1,131 @@
+import sqlite3
+import re
+from collections import Counter
+from email.utils import parseaddr
+import pandas as pd
+from sklearn.cluster import MiniBatchKMeans
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_extraction import text as sklearn_text
+from gmail_cleaner.config import DB_FILE, AUTO_DELETE_EMAIL_PATTERNS, PROTECTED_EMAIL_PATTERNS
+def matches_pattern(email, patterns):
+    email = (email or "").lower()
+    for pattern in patterns:
+        pattern = pattern.lower()
+        if pattern.startswith("@"):
+            domain = pattern[1:]
+            if email.endswith("@" + domain) or email.endswith("." + domain):
+                return True
+        elif email == pattern:
+            return True
+    return False
+def extract_email(sender):
+    return parseaddr(sender or "")[1].lower()
+def extract_domain(sender):
+    email = extract_email(sender)
+    if "@" not in email:
+        return "(unknown)"
+    return email.split("@", 1)[1].lower()
+def canonical_email(sender):
+    email = extract_email(sender)
+    return email.lower() if email else ""
+def matches_auto_delete(email):
+    """Return True if *email* matches an entry of AUTO_DELETE_EMAIL_PATTERNS."""
+    return matches_pattern(email, AUTO_DELETE_EMAIL_PATTERNS)
+def matches_protected(email):
+    """Return True if *email* matches an entry of PROTECTED_EMAIL_PATTERNS."""
+    return matches_pattern(email, PROTECTED_EMAIL_PATTERNS)
+def load_data():
+    conn = sqlite3.connect(DB_FILE)
+    df = pd.read_sql(
+        """
+        SELECT id, sender, subject, internal_date, snippet, body_text, deleted
+        FROM gmail_messages
+        WHERE COALESCE(deleted, 0) = 0
+        """,
+        conn,
+    )
+    conn.close()
+    df["email"] = df["sender"].fillna("").apply(canonical_email)
+    df = df[~df["email"].apply(matches_protected)]
+    df["domain"] = df["sender"].fillna("").apply(extract_domain)
+    return df
+def run_analysis():
+    df = load_data()
+    print("\n" + "=" * 100)
+    print(f"EMAILS: {len(df):,}")
+    print("=" * 100)
+    # --------------------------------------------------
+    # TOP DOMAINS
+    # --------------------------------------------------
+    print("\nTOP DOMAINS")
+    print("-" * 100)
+    for domain, count in df["domain"].value_counts().head(50).items():
+        print(f"{count:8,d}  {domain}")
+    # --------------------------------------------------
+    # TOP SENDERS
+    # --------------------------------------------------
+    print("\nTOP SENDERS")
+    print("-" * 100)
+    for sender, count in df["email"].value_counts().head(100).items():
+        print(f"{count:8,d}  {sender}")
+    # --------------------------------------------------
+    # SUBJECT WORD ANALYSIS
+    # --------------------------------------------------
+    subject_words = []
+    for subject in df["subject"].fillna(""):
+        tokens = re.findall(r"[a-zA-Z]{4,}", subject.lower())
+        subject_words.extend(tokens)
+    print("\nTOP SUBJECT WORDS")
+    print("-" * 100)
+    for word, count in Counter(subject_words).most_common(50):
+        print(f"{count:8,d}  {word}")
+    # --------------------------------------------------
+    # TF-IDF TOPICS
+    # --------------------------------------------------
+    print("\nTOPIC CLUSTERS")
+    print("-" * 100)
+    documents = df["subject"].fillna("") + " " + df["snippet"].fillna("")
+    vectorizer = TfidfVectorizer(
+        stop_words="english", max_features=5000, min_df=5)
+    if len(documents) > 0:
+        X = vectorizer.fit_transform(documents)
+        cluster_count = min(20, max(2, len(df) // 100))
+        model = MiniBatchKMeans(n_clusters=cluster_count,
+                                random_state=42, batch_size=2048)
+        model.fit(X)
+        terms = vectorizer.get_feature_names_out()
+        for cluster_id in range(cluster_count):
+            center = model.cluster_centers_[cluster_id]
+            top_indices = center.argsort()[-12:][::-1]
+            keywords = [terms[i] for i in top_indices]
+            size = (model.labels_ == cluster_id).sum()
+            print(f"\nCluster {cluster_id + 1} ({size:,} emails)")
+            print(", ".join(keywords))
+    print("\nDone.")

gmail_cleaner/auth.py ADDED Viewed

@@ -0,0 +1,57 @@
+import time
+import random
+from pathlib import Path
+from google.oauth2.credentials import Credentials
+from google.auth.transport.requests import Request
+from google_auth_oauthlib.flow import InstalledAppFlow
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+from gmail_cleaner.config import SCOPES, TOKEN_FILE, CREDS_FILE, MAX_RETRIES
+def get_gmail_service():
+    creds = None
+    if Path(TOKEN_FILE).exists():
+        creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            creds.refresh(Request())
+        else:
+            flow = InstalledAppFlow.from_client_secrets_file(
+                CREDS_FILE, SCOPES)
+            creds = flow.run_local_server(port=0)
+        with open(TOKEN_FILE, "w") as f:
+            f.write(creds.to_json())
+    return build("gmail", "v1", credentials=creds, cache_discovery=False)
+def gmail_call(request):
+    for attempt in range(MAX_RETRIES):
+        try:
+            result = request.execute(num_retries=3)
+            return result
+        except HttpError as e:
+            status = getattr(e.resp, "status", None)
+            if status in (429, 500, 502, 503, 504):
+                wait = min(60, 2 ** attempt) + random.random()
+                print(
+                    f"\nHTTP {status} retry {attempt+1}/{MAX_RETRIES} wait={wait:.1f}s")
+                time.sleep(wait)
+                continue
+            raise
+    raise RuntimeError("Max retries exceeded")
+def trash_email(gmail_id):
+    service = get_gmail_service()
+    try:
+        service.users().messages().trash(userId="me", id=gmail_id).execute()
+        return True
+    except Exception as e:
+        return str(e)

gmail_cleaner/config.py ADDED Viewed

@@ -0,0 +1,28 @@
+# config.py
+# Settings shared by the sync, analyze and clean commands.
+# Path of the SQLite database file holding the downloaded messages.
+DB_FILE = "gmail.db"
+# OAuth client-secrets file used when a new authorization flow is needed.
+CREDS_FILE = "credentials.json"
+# File in which the authorized user token is cached between runs.
+TOKEN_FILE = "token.json"
+# OAuth scopes requested for the Gmail API.
+SCOPES = [
+    "https://www.googleapis.com/auth/gmail.modify"
+]
+# Size of the thread pool used when downloading messages.
+MAX_WORKERS = 20
+# Maximum number of attempts for a Gmail request that keeps failing with a
+# retryable HTTP status.
+MAX_RETRIES = 8
+# Number of saved messages between database commits during a sync.
+COMMIT_BATCH = 100
+# --------------------------------------------------
+# CONFIGURABLE PATTERNS
+# --------------------------------------------------
+# Pattern syntax: a full address matches exactly; a pattern starting with
+# "@" matches that domain and its sub-domains. Matching is case-insensitive.
+# Add email addresses or domains you want to automatically categorize for deletion here.
+# Example: "newsletter@example.com", "@spamdomain.com"
+AUTO_DELETE_EMAIL_PATTERNS = [
+    # Add your auto delete patterns here
+]
+# Add email addresses or domains you want to protect from accidental deletion.
+# Example: "personal@gmail.com", "@mybank.com"
+PROTECTED_EMAIL_PATTERNS = [
+    # Add your protected email patterns here
+]

gmail_cleaner/db.py ADDED Viewed

@@ -0,0 +1,68 @@
+import sqlite3
+from gmail_cleaner.config import DB_FILE
+def init_db():
+    conn = sqlite3.connect(
+        DB_FILE,
+        check_same_thread=False
+    )
+    conn.execute("""
+    CREATE TABLE IF NOT EXISTS gmail_messages (
+        id TEXT PRIMARY KEY,
+        thread_id TEXT,
+        history_id INTEGER,
+        internal_date INTEGER,
+        label_ids TEXT,
+        subject TEXT,
+        sender TEXT,
+        recipients TEXT,
+        snippet TEXT,
+        body_text TEXT,
+        raw_json TEXT,
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+    )
+    """)
+    conn.execute("""
+    CREATE TABLE IF NOT EXISTS sync_state (
+        key TEXT PRIMARY KEY,
+        value TEXT
+    )
+    """)
+    conn.execute("""
+    CREATE INDEX IF NOT EXISTS idx_internal_date
+    ON gmail_messages(internal_date)
+    """)
+    # From analyze.py
+    try:
+        conn.execute("""
+        ALTER TABLE gmail_messages
+        ADD COLUMN deleted INTEGER NOT NULL DEFAULT 0
+        """)
+    except sqlite3.OperationalError:
+        pass
+    conn.commit()
+    return conn
+def get_state(conn, key):
+    row = conn.execute(
+        "SELECT value FROM sync_state WHERE key=?",
+        (key,)
+    ).fetchone()
+    return row[0] if row else None
+def set_state(conn, key, value):
+    conn.execute(
+        """
+        INSERT OR REPLACE INTO sync_state(key,value)
+        VALUES (?,?)
+        """,
+        (key, str(value))
+    )

gmail_cleaner/delete.py ADDED Viewed

@@ -0,0 +1,341 @@
+import re
+import sqlite3
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import Counter
+from tqdm import tqdm
+from gmail_cleaner.config import DB_FILE
+from gmail_cleaner.auth import trash_email
+from gmail_cleaner.analyze import load_data, matches_auto_delete
+from sklearn.feature_extraction import text as sklearn_text
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import MiniBatchKMeans
+# Number of worker threads used when moving messages to the trash.
+DELETE_WORKERS = 10
+# Case-insensitive regexes used to spot promotional and newsletter mail.
+# NOTE(review): the alternatives are unanchored, so they also match inside
+# longer words (e.g. "deal" in "ideal") - confirm this is intended.
+PROMOTION_REGEX = r"sale|offer|discount|coupon|deal|cashback|save|limited time"
+NEWSLETTER_REGEX = r"unsubscribe|newsletter|digest|weekly|daily"
+# Menu key typed by the user -> internal strategy name.
+DELETE_STRATEGIES = {
+    "1": "top_senders",
+    "2": "newsletter",
+    "3": "promotions",
+    "4": "subject_patterns",
+    "5": "topic_clusters",
+    "6": "auto_delete_matches",
+}
+# Module-level queue of message ids selected for deletion but not yet trashed.
+pending_delete_ids = []
+def normalize_subject(subject):
+    if not subject:
+        return ""
+    subject = subject.lower()
+    subject = re.sub(r"\d+", "<num>", subject)
+    return re.sub(r"\s+", " ", subject).strip()
+def choose_delete_strategy():
+    print("\nDELETE STRATEGIES")
+    print("-" * 100)
+    print("[1] Top Senders")
+    print("[2] Newsletters")
+    print("[3] Promotions")
+    print("[4] Subject Patterns")
+    print("[5] Topic Clusters")
+    print("[6] Auto Delete Matches")
+    print("[q] Quit")
+    while True:
+        choice = input("\nStrategy: ").strip().lower()
+        if choice == "q":
+            return None
+        if choice in DELETE_STRATEGIES:
+            return DELETE_STRATEGIES[choice]
+        print("Invalid choice")
+# Strategy Builders
+def get_top_sender_candidates(df):
+    return df["email"].value_counts().head(100).items()
+def get_newsletter_candidates(df):
+    newsletter_df = df[
+        (df["subject"].fillna("").str.contains(NEWSLETTER_REGEX, case=False, regex=True)) |
+        (df["body_text"].fillna("").str.contains(
+            NEWSLETTER_REGEX, case=False, regex=True))
+    ]
+    return newsletter_df["email"].value_counts().head(100).items()
+def get_promotion_candidates(df):
+    promo_df = df[
+        (df["subject"].fillna("").str.contains(PROMOTION_REGEX, case=False, regex=True)) |
+        (df["snippet"].fillna("").str.contains(
+            PROMOTION_REGEX, case=False, regex=True))
+    ]
+    return promo_df["email"].value_counts().head(100).items()
+def get_subject_pattern_candidates(df):
+    patterns = Counter()
+    for subject in df["subject"].fillna(""):
+        patterns[normalize_subject(subject)] += 1
+    return patterns.most_common(100)
+def get_cluster_candidates(df):
+    CUSTOM_STOPWORDS = {"email", "dear", "thank", "thanks",
+                        "hi", "hello", "com", "www", "http", "https"}
+    documents = df["subject"].fillna("")
+    stop_words = sklearn_text.ENGLISH_STOP_WORDS.union(CUSTOM_STOPWORDS)
+    vectorizer = TfidfVectorizer(stop_words=list(
+        stop_words), max_features=10000, min_df=3, ngram_range=(1, 2))
+    if len(documents) == 0:
+        return []
+    X = vectorizer.fit_transform(documents)
+    cluster_count = min(100, max(20, len(df) // 20))
+    model = MiniBatchKMeans(n_clusters=cluster_count,
+                            random_state=42, batch_size=2048)
+    model.fit(X)
+    terms = vectorizer.get_feature_names_out()
+    clusters = []
+    for cluster_id in range(cluster_count):
+        center = model.cluster_centers_[cluster_id]
+        top_indices = center.argsort()[-5:][::-1]
+        keywords = ", ".join(terms[i] for i in top_indices)
+        size = (model.labels_ == cluster_id).sum()
+        clusters.append((cluster_id, size, keywords, model.labels_))
+    return clusters
+def get_auto_delete_candidates(df):
+    auto_df = df[df["email"].apply(matches_auto_delete)]
+    return auto_df["email"].value_counts().items()
+# Selection Parsing
+def parse_selection(selection, max_index):
+    indexes = set()
+    for part in selection.split(","):
+        part = part.strip()
+        if not part:
+            continue
+        try:
+            if "-" in part:
+                start, end = part.split("-", 1)
+                if not start or not end:
+                    continue
+                for i in range(int(start), int(end) + 1):
+                    if 1 <= i <= max_index:
+                        indexes.add(i - 1)
+            else:
+                value = int(part)
+                if 1 <= value <= max_index:
+                    indexes.add(value - 1)
+        except ValueError:
+            pass
+    return indexes
+def build_pattern_deletes(sender_df, candidates, selection):
+    if selection == "q":
+        raise KeyboardInterrupt()
+    if selection == "d":
+        return []
+    if selection.startswith("k") or selection == "p":
+        keep = int(selection[1:] or "0")
+        sender_df = sender_df.sort_values("internal_date", ascending=False)
+        return sender_df.iloc[keep:]["id"].dropna().tolist()
+    try:
+        indexes = parse_selection(selection, len(candidates))
+    except Exception:
+        print("Invalid selection")
+        return []
+    delete_patterns = [candidates[idx][0] for idx in indexes]
+    delete_ids = []
+    for _, row in sender_df.iterrows():
+        subject = normalize_subject(row["subject"])
+        if subject in delete_patterns:
+            delete_ids.append(row["id"])
+    return delete_ids
+# Handlers
+def handle_sender(sender, sender_count, df):
+    sender_df = df[df["email"] == sender]
+    patterns = Counter()
+    for subject in sender_df["subject"].fillna(""):
+        patterns[normalize_subject(subject)] += 1
+    candidates = list(patterns.most_common(20))
+    print(f"\n{sender} ({sender_count:,})")
+    for i, (pattern, count) in enumerate(candidates, start=1):
+        print(f"[{i}] {count:,} {pattern[:120]}")
+    selection = input("\nChoice: ").strip()
+    return build_pattern_deletes(sender_df, candidates, selection)
+def handle_newsletter(sender, sender_count, df):
+    sender_df = df[df["email"] == sender]
+    print(f"\nNEWSLETTER: {sender}\nEmails: {sender_count:,}")
+    latest = sender_df.sort_values("internal_date", ascending=False).head(10)
+    for _, row in latest.iterrows():
+        print(row["subject"][:120])
+    choice = input("\nDelete all? [y/N]: ")
+    if choice.lower() != 'y':
+        return []
+    return sender_df["id"].tolist()
+def handle_promotion(sender, sender_count, df):
+    sender_df = df[df["email"] == sender]
+    score = sender_df["subject"].fillna("").str.contains(
+        PROMOTION_REGEX, case=False, regex=True).mean()
+    print(f"\n{sender}\nEmails : {sender_count:,}\nPromo Score: {score:.2f}")
+    choice = input("\nDelete all? [y/N]: ")
+    if choice.lower() != 'y':
+        return []
+    return sender_df["id"].tolist()
+def handle_subject_pattern(pattern, count, df):
+    print(f"\nPATTERN\n{count:,} emails\n{pattern}")
+    samples = df[df["subject"].fillna("").apply(
+        normalize_subject) == pattern]["subject"].head(10)
+    print("\nSamples")
+    for s in samples:
+        print(f"  - {s}")
+    choice = input("\nDelete all? [y/N]: ")
+    if choice.lower() != 'y':
+        return []
+    return df[df["subject"].fillna("").apply(normalize_subject) == pattern]["id"].tolist()
+def handle_cluster(cluster_id, size, keywords, labels, df):
+    print(
+        f"\nCLUSTER {cluster_id + 1}\nEmails: {size:,}\nKeywords: {keywords}")
+    choice = input("\nDelete cluster? [y/N]: ")
+    if choice.lower() != 'y':
+        return []
+    return df[labels == cluster_id]["id"].tolist()
+def handle_auto_delete(sender, sender_count, df):
+    sender_df = df[df["email"] == sender]
+    print(f"\nAUTO RULE MATCH\n{sender}\n{sender_count:,} emails")
+    latest = sender_df["subject"].dropna().head(10)
+    for s in latest:
+        print(f"  - {s}")
+    choice = input("\nDelete all? [Y/n]: ")
+    if choice.lower() == 'n':
+        return []
+    return sender_df["id"].tolist()
+def delete_gmail(ids, dry_run=False):
+    ids_to_delete = list(set(ids))
+    if not ids_to_delete:
+        return []
+    print(f"\nFinal delete pass: {len(ids_to_delete):,} emails")
+    if dry_run:
+        print("[DRY RUN] Would delete the following IDs:")
+        for gid in ids_to_delete:
+            print(f"  - {gid}")
+        return []
+    success = 0
+    failed = 0
+    successful_ids = []
+    with ThreadPoolExecutor(max_workers=DELETE_WORKERS) as executor:
+        future_to_id = {executor.submit(
+            trash_email, gid): gid for gid in ids_to_delete}
+        for future in tqdm(as_completed(future_to_id), total=len(future_to_id), desc="Deleting"):
+            gmail_id = future_to_id[future]
+            if future.result() is True:
+                success += 1
+                successful_ids.append(gmail_id)
+            else:
+                failed += 1
+    if successful_ids:
+        db_conn = sqlite3.connect(DB_FILE)
+        db_conn.executemany("UPDATE gmail_messages SET deleted = 1 WHERE id = ?", [
+                            (gid,) for gid in successful_ids])
+        db_conn.commit()
+        db_conn.close()
+    print(f"\nDeleted: {success:,}")
+    print(f"Failed : {failed:,}")
+    return [x for x in ids_to_delete if x not in successful_ids]
+def handle_delete(delete_ids, skip=False, dry_run=False):
+    global pending_delete_ids
+    pending_delete_ids.extend(delete_ids)
+    print(f"\nPending delete queue: {len(pending_delete_ids):,} emails")
+    if not skip:
+        action = input("\n[yes | now | exit] : ").strip().lower()
+        if action == "" or action == "yes" or action == "y":
+            return
+        if action == "exit":
+            return "exit"
+        if action == "now":
+            pending_delete_ids = delete_gmail(
+                pending_delete_ids, dry_run=dry_run)
+def run_delete_flow(dry_run=False):
+    global pending_delete_ids
+    df = load_data()
+    strategy = choose_delete_strategy()
+    if not strategy:
+        return
+    try:
+        if strategy == "top_senders":
+            for sender, count in get_top_sender_candidates(df):
+                ids = handle_sender(sender, count, df)
+                if handle_delete(ids, True, dry_run=dry_run) == "exit":
+                    break
+        elif strategy == "newsletter":
+            for sender, count in get_newsletter_candidates(df):
+                ids = handle_newsletter(sender, count, df)
+                if handle_delete(ids, dry_run=dry_run) == "exit":
+                    break
+        elif strategy == "promotions":
+            for sender, count in get_promotion_candidates(df):
+                ids = handle_promotion(sender, count, df)
+                if handle_delete(ids, dry_run=dry_run) == "exit":
+                    break
+        elif strategy == "subject_patterns":
+            for pattern, count in get_subject_pattern_candidates(df):
+                ids = handle_subject_pattern(pattern, count, df)
+                if handle_delete(ids, dry_run=dry_run) == "exit":
+                    break
+        elif strategy == "topic_clusters":
+            for cluster_id, size, keywords, labels in get_cluster_candidates(df):
+                ids = handle_cluster(cluster_id, size, keywords, labels, df)
+                if handle_delete(ids, dry_run=dry_run) == "exit":
+                    break
+        elif strategy == "auto_delete_matches":
+            for sender, count in get_auto_delete_candidates(df):
+                ids = handle_auto_delete(sender, count, df)
+                if handle_delete(ids, dry_run=dry_run) == "exit":
+                    break
+    except KeyboardInterrupt:
+        print("\nExiting interactive loop.")
+    if pending_delete_ids:
+        delete_gmail(pending_delete_ids, dry_run=dry_run)

gmail_cleaner/main.py ADDED Viewed

@@ -0,0 +1,60 @@
+import argparse
+from gmail_cleaner.db import init_db, get_state
+from gmail_cleaner.auth import get_gmail_service
+from gmail_cleaner.sync import full_sync, incremental_sync
+from gmail_cleaner.analyze import run_analysis
+from gmail_cleaner.delete import run_delete_flow
+def sync_command(args):
+    conn = init_db()
+    history_id = get_state(conn, "history_id")
+    if history_id is None:
+        full_sync(conn)
+    else:
+        incremental_sync(conn, history_id)
+    total = conn.execute("SELECT COUNT(*) FROM gmail_messages").fetchone()[0]
+    print(f"\nDatabase now contains {total:,} emails")
+    conn.close()
+def analyze_command(args):
+    """Handle the "analyze" sub-command; *args* is accepted but not used."""
+    run_analysis()
+def clean_command(args):
+    """Handle the "clean" sub-command, forwarding the --dry-run flag."""
+    run_delete_flow(dry_run=args.dry_run)
+def main():
+    parser = argparse.ArgumentParser(
+        description="Gmail Sync and Analysis Tool")
+    subparsers = parser.add_subparsers(title="commands", dest="command")
+    subparsers.required = True
+    # Sync
+    parser_sync = subparsers.add_parser(
+        "sync", help="Synchronize emails from Gmail to local DB")
+    parser_sync.set_defaults(func=sync_command)
+    # Analyze
+    parser_analyze = subparsers.add_parser(
+        "analyze", help="Analyze downloaded emails and show statistics")
+    parser_analyze.set_defaults(func=analyze_command)
+    # Clean
+    parser_clean = subparsers.add_parser(
+        "clean", help="Interactive CLI to bulk delete emails")
+    parser_clean.add_argument("--dry-run", action="store_true",
+                              help="Simulate deletion without calling Gmail API")
+    parser_clean.set_defaults(func=clean_command)
+    args = parser.parse_args()
+    args.func(args)
+if __name__ == "__main__":
+    main()

gmail_cleaner/sync.py ADDED Viewed

@@ -0,0 +1,160 @@
+import json
+import base64
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from threading import Lock
+from tqdm import tqdm
+from gmail_cleaner.config import MAX_WORKERS, COMMIT_BATCH
+from gmail_cleaner.db import set_state
+from gmail_cleaner.auth import gmail_call, get_gmail_service
+def decode_body(data):
+    if not data:
+        return ""
+    try:
+        return base64.urlsafe_b64decode(data.encode("UTF-8")).decode("utf-8", errors="ignore")
+    except Exception:
+        return ""
+def extract_text(payload):
+    body = payload.get("body", {}).get("data")
+    if body:
+        return decode_body(body)
+    for part in payload.get("parts", []):
+        mime = part.get("mimeType")
+        if mime == "text/plain":
+            return decode_body(part.get("body", {}).get("data"))
+    return ""
+def fetch_message(message_id):
+    try:
+        service = get_gmail_service()
+        request = service.users().messages().get(
+            userId="me", id=message_id, format="metadata")
+        return gmail_call(request)
+    except Exception as e:
+        print(f"\nFAILED {message_id}: {e}")
+        return None
+def save_message(conn, msg):
+    headers = {h["name"]: h["value"]
+               for h in msg["payload"].get("headers", [])}
+    body_text = extract_text(msg["payload"])
+    conn.execute("""
+        INSERT OR REPLACE INTO gmail_messages (
+            id, thread_id, history_id, internal_date, label_ids,
+            subject, sender, recipients, snippet, body_text, raw_json
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+    """, (
+        msg["id"],
+        msg["threadId"],
+        int(msg.get("historyId", 0)),
+        int(msg.get("internalDate", 0)),
+        json.dumps(msg.get("labelIds", [])),
+        headers.get("Subject"),
+        headers.get("From"),
+        headers.get("To"),
+        msg.get("snippet"),
+        body_text,
+        json.dumps(msg)
+    ))
+def full_sync(conn):
+    """Download every message of the account into the local database.
+    First pages through the message list to collect all ids, then fetches
+    the messages in batches using a thread pool, saving each result through
+    *conn*. The highest history id seen is stored under "history_id".
+    """
+    print("Starting full sync...")
+    service = get_gmail_service()
+    all_ids = []
+    request = service.users().messages().list(userId="me", maxResults=500)
+    # list_next returns a falsy value once there are no more pages.
+    while request:
+        response = gmail_call(request)
+        all_ids.extend(response.get("messages", []))
+        print(f"\rFound {len(all_ids):,} emails...", end="", flush=True)
+        request = service.users().messages().list_next(request, response)
+    total = len(all_ids)
+    print(f"\nFound {total:,} emails")
+    max_history_id = 0
+    processed = 0
+    BATCH_SIZE = 500
+    for batch_start in range(0, total, BATCH_SIZE):
+        batch = all_ids[batch_start:batch_start + BATCH_SIZE]
+        print(
+            f"\nBatch {batch_start:,}-{batch_start + len(batch):,} of {total:,}")
+        # Downloads run in worker threads; saving happens in this thread
+        # as each future completes.
+        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            futures = [executor.submit(fetch_message, item["id"])
+                       for item in batch]
+            for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
+                try:
+                    msg = future.result()
+                    # fetch_message returns None for a failed download.
+                    if not msg:
+                        continue
+                    save_message(conn, msg)
+                    max_history_id = max(
+                        max_history_id, int(msg.get("historyId", 0)))
+                    processed += 1
+                    if processed % COMMIT_BATCH == 0:
+                        conn.commit()
+                except Exception as e:
+                    # A single bad message is reported and skipped.
+                    print(f"\nWorker error: {e}")
+        conn.commit()
+        print(f"Saved {processed:,}/{total:,}")
+    set_state(conn, "history_id", max_history_id)
+    conn.commit()
+    print(f"\nFull sync complete ({processed:,})")
+def incremental_sync(conn, last_history_id):
+    """Download messages added since *last_history_id*.
+    Pages through the account history, collects the ids of added messages,
+    fetches them with a thread pool, saves them through *conn*, and stores
+    the newest history id seen under "history_id".
+    """
+    print(f"Incremental sync from history {last_history_id}")
+    service = get_gmail_service()
+    request = service.users().history().list(
+        userId="me", startHistoryId=last_history_id)
+    new_ids = []
+    newest_history = int(last_history_id)
+    while request:
+        response = gmail_call(request)
+        for item in response.get("history", []):
+            newest_history = max(newest_history, int(item["id"]))
+            for added in item.get("messagesAdded", []):
+                new_ids.append(added["message"]["id"])
+        request = service.users().history().list_next(request, response)
+    # The same message can show up in several history records.
+    new_ids = list(set(new_ids))
+    if not new_ids:
+        # NOTE(review): returning here skips the set_state call below, so
+        # newest_history is not persisted when nothing was added - confirm
+        # that is intended.
+        print("No new emails")
+        return
+    print(f"Found {len(new_ids):,} new emails")
+    # NOTE(review): the lock is only ever taken in this thread, inside the
+    # as_completed loop - verify whether it is still needed.
+    lock = Lock()
+    processed = 0
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        futures = {executor.submit(
+            fetch_message, msg_id): msg_id for msg_id in new_ids}
+        for future in tqdm(as_completed(futures), total=len(new_ids), desc="Fetching"):
+            msg = future.result()
+            # fetch_message returns None for a failed download.
+            if msg:
+                with lock:
+                    save_message(conn, msg)
+                    processed += 1
+                    if processed % COMMIT_BATCH == 0:
+                        conn.commit()
+    set_state(conn, "history_id", newest_history)
+    conn.commit()
+    print(f"Incremental sync complete ({processed:,})")

gmail_cleaner-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,121 @@
+Metadata-Version: 2.4
+Name: gmail_cleaner
+Version: 0.1.0
+Summary: A powerful local CLI tool to sync, cluster, and surgically clean your Gmail.
+Author-email: Mayank Gupta <mayankgupta690@gmail.com>
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: google-api-python-client
+Requires-Dist: google-auth-httplib2
+Requires-Dist: google-auth-oauthlib
+Requires-Dist: pandas
+Requires-Dist: scikit-learn
+Requires-Dist: tqdm
+Dynamic: license-file
+<p align="center">
+  <img src="assets/logo.png" width="220" alt="Gmail Cleaner Logo">
+</p>
+<h1 align="center">Gmail Cleaner</h1>
+<p align="center">
+  <em>Because somewhere in those 50,000 emails is a tax document you actually need.</em>
+</p>
+<p align="center">
+  <a href="https://pypi.org/project/gmail-cleaner/"><img src="https://img.shields.io/pypi/v/gmail-cleaner.svg?color=blue" alt="PyPI version"></a>
+  <a href="https://codecov.io/gh/immkg/gmail-cleaner"><img src="https://codecov.io/gh/immkg/gmail-cleaner/branch/main/graph/badge.svg" alt="Coverage"></a>
+  <a href="https://github.com/immkg/gmail-cleaner/releases/latest"><img src="https://img.shields.io/github/v/release/immkg/gmail-cleaner?style=flat-square&color=brightgreen&label=release" alt="Release"></a>
+  <img src="https://img.shields.io/badge/license-MIT-111111" alt="MIT license">
+</p>
+<p align="center">
+  <strong>Built by Mayank Gupta</strong><br>
+  <sub>Surgical precision for your inbox. ~99% less clutter &middot; ~100% more sanity &middot; 100% local.</sub>
+</p>
+---
+You have 50,000 emails. Somewhere in that mountain of newsletters, random promotions, and auto-generated alerts is a critical message from your bank that you cannot afford to lose. You can't just 'Select All -> Delete'.
+You need a surgical tool.
+With Gmail Cleaner:
+```bash
+# It just deletes them.
+python -m gmail_cleaner.main clean
+```
+Gmail Cleaner synchronizes your Gmail account into a local SQLite database, analyzes your emails using Pandas and Scikit-Learn, and interactively bulk-deletes the noise based on aggressive, local strategies.
+## Before / after
+You try to find an important bank email. Your search results are flooded with 400 "Limited Time Offer!" emails from a newsletter you never subscribed to.
+After Gmail Cleaner: You actually see your bank email.
+## Setup
+The most effort Gmail Cleaner will ever ask of you:
+### 1. Enable the Gmail API
+1. Go to the [Google Cloud Console](https://console.cloud.google.com/).
+2. Create a new project or select an existing one.
+3. Navigate to **APIs & Services > Library**.
+4. Search for "Gmail API" and click **Enable**.
+### 2. Set Up Desktop App Credentials
+1. Go to **APIs & Services > OAuth consent screen**. Choose **External** and add your own Gmail address under **Test users**.
+2. Go to **APIs & Services > Credentials**.
+3. Click **Create Credentials** > **OAuth client ID**.
+4. Choose **Desktop app**.
+5. Click **Download JSON** on the confirmation dialog.
+6. Rename the file to `credentials.json` and place it in the root folder.
+### 3. Install
+You can install directly from PyPI or download the standalone `.exe` from the latest GitHub Release.
+```bash
+pip install gmail-cleaner
+```
+## Configuration
+Before running the clean command, open `gmail_cleaner/config.py` and customize your patterns:
+- `AUTO_DELETE_EMAIL_PATTERNS`: Add email addresses or domains that you always want to delete instantly (e.g. `newsletter@spam.com`).
+- `PROTECTED_EMAIL_PATTERNS`: Add personal or banking emails that should NEVER be deleted by the tool (e.g. `@yourbank.com`).
+*Lazy, not negligent: The code ensures these patterns are scrubbed from version control, so you can safely keep this repo synced without leaking your personal contacts.*
+## Commands
+| Command | What it does |
+|---------|--------------|
+| `python -m gmail_cleaner.main sync` | Syncs your emails to a local SQLite database. |
+| `python -m gmail_cleaner.main analyze` | Analyzes emails to show top domains, senders, and TF-IDF topic clusters. |
+| `python -m gmail_cleaner.main clean` | Starts the interactive CLI to bulk delete emails based on strategies (newsletters, promotions, etc.). |
+| `python -m gmail_cleaner.main clean --dry-run` | Simulates the deletion queue and prints what *would* be deleted without touching the Gmail API. |
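+*When cleaning, you can select specific entries (e.g., `1,2,5-10`, or `k10` to keep only the latest 10 emails) and accumulate deletions in a queue. After each step, type `yes` (or just press Enter) to keep queuing and move on, `now` to process the queue immediately, or `exit` to stop reviewing; any emails still queued are processed when the session ends.*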
+## FAQ
+**Does it upload my emails to a random server?**
+No. It downloads them to a local SQLite database (`gmail.db`). The TF-IDF analysis, clustering, and deleting all happen entirely on your machine. 100% local.
+**Will it accidentally delete my tax returns or bank statements?**
+Not if you tell it not to. Add `@yourbank.com` or your accountant's email to `PROTECTED_EMAIL_PATTERNS` in `config.py`. Once there, it becomes structurally impossible for the tool to delete them, even if you explicitly select them.
+**Can I undo a deletion?**
+Yes. The tool moves emails to the Gmail Trash folder; it doesn't permanently annihilate them. You have 30 days to rescue them through the standard Gmail UI before Google purges them forever.
+**What if I really need that shoe store newsletter from 2019?**
+You don't. Insist anyway and you can use the `k` command (e.g., `k10`) to keep the latest 10 and delete the rest. But it will judge you silently.
+## License
+[MIT](LICENSE). The shortest license that works.

gmail_cleaner-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+gmail_cleaner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+gmail_cleaner/analyze.py,sha256=VCH8D3SWYkYt2OdgToej21VTFwKDpyMdfUZuZpgKa0k,4152
+gmail_cleaner/auth.py,sha256=c7ySl-h6yIB-ynCgrc25UYtm3JxHSc5-LS88NNbrBCQ,1845
+gmail_cleaner/config.py,sha256=Zaua7CRRiP6lEHL86Ylh3Ayvu2ZOsBDPJOsnRcS1jAQ,790
+gmail_cleaner/db.py,sha256=ybjTN2XsdjppqDPVRJR2BFC3R4Aq8vjBJ03ASJrGcqc,1482
+gmail_cleaner/delete.py,sha256=6y-0DeX4Tp6fCLn8i6blMw_pJX0nn0xRJeB4qlsi9wk,11921
+gmail_cleaner/main.py,sha256=3K3rYGtLzNUt1EavI13K6c-5sTHcuvCiWqtinttmuDw,1763
+gmail_cleaner/sync.py,sha256=p8nv_Mh1giV5mMCF3zvl1zW76rBBOBm_WPDz_6NIvvI,5294
+gmail_cleaner-0.1.0.dist-info/licenses/LICENSE,sha256=XKKSDU9WlUEAyPNlRhq6e2xhVNpJc097JwPZJ1rUnRE,1077
+tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/conftest.py,sha256=DvyhPLgg4opdZqcM7DHO94fM-RurxcHhCpG8SSdO3Es,2816
+tests/test_analyze.py,sha256=WwsbsD0bLTxD9Lm2TprsLpk-8noL_fTZ3q_zDV6R9r0,2149
+tests/test_delete.py,sha256=anqQG5GefrNpMmMa50aT3spXMi0qPqdILbflm6VdBN4,3195
+gmail_cleaner-0.1.0.dist-info/METADATA,sha256=yJ9F4DcCH8eXXU_HHfJrCYL4HveSjxWuy1SiffIjYAg,5654
+gmail_cleaner-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+gmail_cleaner-0.1.0.dist-info/entry_points.txt,sha256=QWII3zikd5I12tcunQy6ZZfxVzJh2-CpGQaiw9BZdec,58
+gmail_cleaner-0.1.0.dist-info/top_level.txt,sha256=lWdsgkubIg3PgZ1xyYyXpkzWWOSYf0z8w-5ymSC-SNQ,20
+gmail_cleaner-0.1.0.dist-info/RECORD,,

gmail_cleaner-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

gmail_cleaner-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+gmail-cleaner = gmail_cleaner.main:main

gmail_cleaner-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

gmail_cleaner-0.1.0.dist-info/top_level.txt ADDED Viewed

@@ -0,0 +1,2 @@
+gmail_cleaner
+tests

tests/__init__.py ADDED Viewed

File without changes

tests/conftest.py ADDED Viewed

@@ -0,0 +1,50 @@
+import pytest
+import sqlite3
+import pandas as pd
+from unittest.mock import patch
+# Set up synthetic configuration values for the tests
+@pytest.fixture(autouse=True)
+def mock_config(monkeypatch):
+    monkeypatch.setattr("gmail_cleaner.analyze.DB_FILE", ":memory:")
+    monkeypatch.setattr("gmail_cleaner.delete.DB_FILE", ":memory:")
+    monkeypatch.setattr("gmail_cleaner.config.AUTO_DELETE_EMAIL_PATTERNS", ["spam@trash.com", "@junk.com"])
+    monkeypatch.setattr("gmail_cleaner.analyze.AUTO_DELETE_EMAIL_PATTERNS", ["spam@trash.com", "@junk.com"])
+    monkeypatch.setattr("gmail_cleaner.config.PROTECTED_EMAIL_PATTERNS", ["vip@bank.com", "@mywork.com"])
+    monkeypatch.setattr("gmail_cleaner.analyze.PROTECTED_EMAIL_PATTERNS", ["vip@bank.com", "@mywork.com"])
+@pytest.fixture
+def synthetic_df():
+    data = [
+        # Normal email
+        {"id": "msg1", "sender": "friend@gmail.com", "subject": "Lunch?", "snippet": "Hey, lunch tomorrow?", "body_text": "Let me know", "internal_date": 1000},
+        # Newsletter
+        {"id": "msg2", "sender": "daily@news.com", "subject": "Your daily newsletter", "snippet": "Read more...", "body_text": "Click here to unsubscribe.", "internal_date": 1001},
+        # Promotion
+        {"id": "msg3", "sender": "marketing@store.com", "subject": "50% Discount on Shoes!", "snippet": "Limited time offer", "body_text": "Big sale today only", "internal_date": 1002},
+        # Protected (should normally be filtered by load_data)
+        {"id": "msg4", "sender": "alerts@mywork.com", "subject": "Server Down", "snippet": "Urgent", "body_text": "", "internal_date": 1003},
+        # Auto Delete
+        {"id": "msg5", "sender": "spam@trash.com", "subject": "Win a prize", "snippet": "You won", "body_text": "Click link", "internal_date": 1004},
+        # Another from top sender
+        {"id": "msg6", "sender": "marketing@store.com", "subject": "Another coupon", "snippet": "Save more", "body_text": "Sale sale sale", "internal_date": 1005},
+        # Another newsletter
+        {"id": "msg7", "sender": "daily@news.com", "subject": "Weekly digest", "snippet": "More news...", "body_text": "Unsubscribe below.", "internal_date": 1006},
+    ]
+    df = pd.DataFrame(data)
+    from gmail_cleaner.analyze import canonical_email, extract_domain
+    df["email"] = df["sender"].apply(canonical_email)
+    df["domain"] = df["sender"].apply(extract_domain)
+    return df
+@pytest.fixture
+def mock_db(synthetic_df):
+    conn = sqlite3.connect(":memory:")
+    synthetic_df.to_sql("gmail_messages", conn, index=False)
+    # Add 'deleted' column as expected by load_data
+    conn.execute("ALTER TABLE gmail_messages ADD COLUMN deleted INTEGER NOT NULL DEFAULT 0")
+    with patch("sqlite3.connect", return_value=conn):
+        yield conn

tests/test_analyze.py ADDED Viewed

@@ -0,0 +1,61 @@
+from gmail_cleaner.analyze import (
+    extract_email,
+    extract_domain,
+    canonical_email,
+    matches_pattern,
+    matches_protected,
+    matches_auto_delete,
+    load_data
+)
+def test_extract_email():
+    assert extract_email("John Doe <john@example.com>") == "john@example.com"
+    assert extract_email("john@example.com") == "john@example.com"
+    assert extract_email(None) == ""
+def test_extract_domain():
+    assert extract_domain("John Doe <john@example.com>") == "example.com"
+    assert extract_domain("no-reply@sub.domain.com") == "sub.domain.com"
+    assert extract_domain("invalid-email") == "(unknown)"
+def test_canonical_email():
+    assert canonical_email("John Doe <john@example.com>") == "john@example.com"
+    assert canonical_email("John Doe <John@Example.COM>") == "john@example.com"
+def test_matches_pattern():
+    patterns = ["@spam.com", "exact@match.com"]
+    assert matches_pattern("user@spam.com", patterns) is True
+    assert matches_pattern("exact@match.com", patterns) is True
+    assert matches_pattern("user@notspam.com", patterns) is False
+def test_matches_protected(mock_config):
+    # From conftest mock: ["vip@bank.com", "@mywork.com"]
+    assert matches_protected("user@mywork.com") is True
+    assert matches_protected("vip@bank.com") is True
+    assert matches_protected("other@bank.com") is False
+def test_matches_auto_delete(mock_config):
+    # From conftest mock: ["spam@trash.com", "@junk.com"]
+    assert matches_auto_delete("spam@trash.com") is True
+    assert matches_auto_delete("user@junk.com") is True
+    assert matches_auto_delete("user@good.com") is False
+def test_load_data(mock_db, mock_config):
+    df = load_data()
+    # "alerts@mywork.com" should be filtered out because it matches PROTECTED_EMAIL_PATTERNS
+    emails = df["email"].tolist()
+    assert "alerts@mywork.com" not in emails
+    # "spam@trash.com" should still be there (it gets matched later by auto delete strategies)
+    assert "spam@trash.com" in emails
+    # The dataframe should have 6 rows (7 synthetic - 1 protected)
+    assert len(df) == 6

tests/test_delete.py ADDED Viewed

@@ -0,0 +1,92 @@
+from gmail_cleaner.delete import (
+    parse_selection,
+    get_newsletter_candidates,
+    get_promotion_candidates,
+    get_top_sender_candidates,
+    get_auto_delete_candidates,
+    build_pattern_deletes
+)
+def test_parse_selection_single_values():
+    assert parse_selection("1", 10) == {0}
+    assert parse_selection("1, 3, 5", 10) == {0, 2, 4}
+def test_parse_selection_ranges():
+    assert parse_selection("1-5", 10) == {0, 1, 2, 3, 4}
+    assert parse_selection("1-3, 5-6", 10) == {0, 1, 2, 4, 5}
+def test_parse_selection_out_of_bounds():
+    # 15 should be ignored as max_index is 10
+    assert parse_selection("1, 15", 10) == {0}
+    # A malformed token such as "-5" is expected to select nothing rather than raise
+    assert parse_selection("-5", 10) == set()
+def test_parse_selection_mixed():
+    assert parse_selection("1, 3-5, 9", 10) == {0, 2, 3, 4, 8}
+def test_get_newsletter_candidates(synthetic_df):
+    candidates = list(get_newsletter_candidates(synthetic_df))
+    # daily@news.com has 2 newsletter emails
+    assert len(candidates) == 1
+    assert candidates[0][0] == "daily@news.com"
+    assert candidates[0][1] == 2
+def test_get_promotion_candidates(synthetic_df):
+    candidates = list(get_promotion_candidates(synthetic_df))
+    # marketing@store.com has 2 promotions
+    assert len(candidates) == 1
+    assert candidates[0][0] == "marketing@store.com"
+    assert candidates[0][1] == 2
+def test_get_top_sender_candidates(synthetic_df):
+    candidates = list(get_top_sender_candidates(synthetic_df))
+    # marketing and daily both have 2
+    senders = [c[0] for c in candidates]
+    assert "marketing@store.com" in senders
+    assert "daily@news.com" in senders
+    assert "friend@gmail.com" in senders
+def test_get_auto_delete_candidates(synthetic_df, mock_config):
+    candidates = list(get_auto_delete_candidates(synthetic_df))
+    assert len(candidates) == 1
+    assert candidates[0][0] == "spam@trash.com"
+def test_build_pattern_deletes(synthetic_df):
+    sender_df = synthetic_df[synthetic_df["email"] == "marketing@store.com"]
+    # 2 emails: "50% Discount on Shoes!" (msg3) and "Another coupon" (msg6)
+    # Fake candidate list like what handle_sender creates
+    # [(normalized_subject, count)]
+    candidates = [
+        ("another coupon", 1),
+        ("<num>% discount on shoes!", 1)
+    ]
+    # Test selecting the first pattern ("another coupon")
+    ids = build_pattern_deletes(sender_df, candidates, "1")
+    assert len(ids) == 1
+    assert ids[0] == "msg6"
+    # Test selecting both patterns ("1, 2")
+    ids = build_pattern_deletes(sender_df, candidates, "1, 2")
+    assert len(ids) == 2
+    assert "msg3" in ids
+    assert "msg6" in ids
+    # Test "keep 1" (k1) -> keeps the newest (msg6 which has internal_date 1005), deletes msg3 (1002)
+    ids = build_pattern_deletes(sender_df, candidates, "k1")
+    assert len(ids) == 1
+    assert ids[0] == "msg3"
+    # Test "d": despite the letter, this selection does not mean "delete all" - it is expected to yield no deletions
+    ids = build_pattern_deletes(sender_df, candidates, "d")
+    assert len(ids) == 0