gmail-cleaner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,131 @@
1
+ import sqlite3
2
+ import re
3
+ from collections import Counter
4
+ from email.utils import parseaddr
5
+
6
+ import pandas as pd
7
+ from sklearn.cluster import MiniBatchKMeans
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.feature_extraction import text as sklearn_text
10
+
11
+ from gmail_cleaner.config import DB_FILE, AUTO_DELETE_EMAIL_PATTERNS, PROTECTED_EMAIL_PATTERNS
12
+
13
+
14
def matches_pattern(email, patterns):
    """Return True if *email* matches any entry in *patterns*.

    A pattern that starts with "@" is a domain rule: it matches addresses at
    that domain or at any subdomain of it. Any other pattern must equal the
    whole address. Matching is case-insensitive and ignores surrounding
    whitespace.

    Blank patterns and a bare "@" are skipped, so a stray empty string in the
    configuration cannot match messages whose sender address is missing.
    """
    email = (email or "").strip().lower()
    if not email:
        return False

    for pattern in patterns:
        pattern = (pattern or "").strip().lower()
        if pattern.startswith("@"):
            domain = pattern[1:]
            # "@example.com" covers both user@example.com and
            # user@mail.example.com, but not user@notexample.com.
            if domain and email.endswith(("@" + domain, "." + domain)):
                return True
        elif pattern and email == pattern:
            return True
    return False
25
+
26
+
27
def extract_email(sender):
    """Pull the bare, lower-cased address out of a raw sender header value."""
    _display_name, address = parseaddr(sender if sender else "")
    return address.lower()
29
+
30
+
31
def extract_domain(sender):
    """Return the lower-cased domain of *sender*, or "(unknown)" if it has none."""
    address = extract_email(sender)
    _local_part, at_sign, host = address.partition("@")
    # No "@" means no usable address could be parsed from the header.
    return host.lower() if at_sign else "(unknown)"
36
+
37
+
38
def canonical_email(sender):
    """Return the lower-cased sender address, or "" when none can be parsed."""
    address = extract_email(sender)
    if not address:
        return ""
    return address.lower()
41
+
42
+
43
def matches_auto_delete(email):
    """Tell whether *email* matches one of the configured auto-delete patterns."""
    is_auto_delete = matches_pattern(email, AUTO_DELETE_EMAIL_PATTERNS)
    return is_auto_delete
45
+
46
+
47
def matches_protected(email):
    """Tell whether *email* matches one of the configured protected patterns."""
    is_protected = matches_pattern(email, PROTECTED_EMAIL_PATTERNS)
    return is_protected
49
+
50
+
51
def load_data():
    """Load every non-deleted, non-protected message into a DataFrame.

    Reads ``gmail_messages`` from ``DB_FILE``, skipping rows already flagged
    as deleted, and adds two derived columns: ``email`` (canonical sender
    address) and ``domain`` (sender domain, or "(unknown)"). Rows whose
    sender matches ``PROTECTED_EMAIL_PATTERNS`` are dropped here, so callers
    of this function never receive them.

    Returns:
        pandas.DataFrame with the selected columns plus ``email`` and
        ``domain``. The original row index is preserved.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        df = pd.read_sql(
            """
            SELECT id, sender, subject, internal_date, snippet, body_text, deleted
            FROM gmail_messages
            WHERE COALESCE(deleted, 0) = 0
            """,
            conn,
        )
    finally:
        # Release the handle even when the query fails (e.g. missing table).
        conn.close()

    df["email"] = df["sender"].fillna("").apply(canonical_email)

    # astype(bool) keeps the mask boolean when the table is empty, where
    # apply() would otherwise hand back an object-dtype Series.
    protected = df["email"].apply(matches_protected).astype(bool)

    # copy() so the column assignment below targets an independent frame
    # instead of a slice of the unfiltered one.
    df = df[~protected].copy()
    df["domain"] = df["sender"].fillna("").apply(extract_domain)

    return df
68
+
69
+
70
def run_analysis():
    """Print a plain-text report about the locally synced mailbox.

    The report has four sections: top sender domains, top sender addresses,
    the most frequent subject words (ASCII letters only, four or more
    characters), and TF-IDF/k-means topic clusters built from subject plus
    snippet. The clustering section is skipped with a message when the
    mailbox is too small for the vectorizer to keep any term.
    """
    df = load_data()

    print("\n" + "=" * 100)
    print(f"EMAILS: {len(df):,}")
    print("=" * 100)

    # --------------------------------------------------
    # TOP DOMAINS
    # --------------------------------------------------
    print("\nTOP DOMAINS")
    print("-" * 100)
    for domain, count in df["domain"].value_counts().head(50).items():
        print(f"{count:8,d} {domain}")

    # --------------------------------------------------
    # TOP SENDERS
    # --------------------------------------------------
    print("\nTOP SENDERS")
    print("-" * 100)
    for sender, count in df["email"].value_counts().head(100).items():
        print(f"{count:8,d} {sender}")

    # --------------------------------------------------
    # SUBJECT WORD ANALYSIS
    # --------------------------------------------------
    subject_words = Counter()
    for subject in df["subject"].fillna(""):
        subject_words.update(re.findall(r"[a-zA-Z]{4,}", subject.lower()))

    print("\nTOP SUBJECT WORDS")
    print("-" * 100)
    for word, count in subject_words.most_common(50):
        print(f"{count:8,d} {word}")

    # --------------------------------------------------
    # TF-IDF TOPICS
    # --------------------------------------------------
    print("\nTOPIC CLUSTERS")
    print("-" * 100)
    documents = df["subject"].fillna("") + " " + df["snippet"].fillna("")
    vectorizer = TfidfVectorizer(
        stop_words="english", max_features=5000, min_df=5)

    X = None
    if len(documents) > 0:
        try:
            X = vectorizer.fit_transform(documents)
        except ValueError:
            # min_df=5 can prune every term on a small mailbox, in which
            # case scikit-learn raises instead of returning an empty matrix.
            X = None

    if X is None:
        print("Not enough text to build topic clusters.")
    else:
        # k-means cannot fit more clusters than there are documents.
        cluster_count = min(20, max(2, len(df) // 100), X.shape[0])
        model = MiniBatchKMeans(n_clusters=cluster_count,
                                random_state=42, batch_size=2048)
        model.fit(X)
        terms = vectorizer.get_feature_names_out()

        for cluster_id in range(cluster_count):
            center = model.cluster_centers_[cluster_id]
            # Highest-weighted terms of the centroid, strongest first.
            top_indices = center.argsort()[-12:][::-1]
            keywords = [terms[i] for i in top_indices]
            size = (model.labels_ == cluster_id).sum()
            print(f"\nCluster {cluster_id + 1} ({size:,} emails)")
            print(", ".join(keywords))

    print("\nDone.")
gmail_cleaner/auth.py ADDED
@@ -0,0 +1,57 @@
1
+ import time
2
+ import random
3
+ from pathlib import Path
4
+
5
+ from google.oauth2.credentials import Credentials
6
+ from google.auth.transport.requests import Request
7
+ from google_auth_oauthlib.flow import InstalledAppFlow
8
+ from googleapiclient.discovery import build
9
+ from googleapiclient.errors import HttpError
10
+
11
+ from gmail_cleaner.config import SCOPES, TOKEN_FILE, CREDS_FILE, MAX_RETRIES
12
+
13
+
14
def get_gmail_service():
    """Build an authorized Gmail API client, refreshing or creating the token.

    Cached credentials are loaded from ``TOKEN_FILE`` when it exists. If they
    are missing or invalid, an expired token with a refresh token is
    refreshed; otherwise the installed-app OAuth flow is run on a local
    server port chosen by the OS. New or refreshed credentials are written
    back to ``TOKEN_FILE``.

    Returns:
        The client object returned by ``build("gmail", "v1", ...)``.
    """
    import os  # local import: only needed for the restricted-permission write

    creds = None

    if Path(TOKEN_FILE).exists():
        creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                CREDS_FILE, SCOPES)
            creds = flow.run_local_server(port=0)

        # The token grants mailbox access, so a newly created file is made
        # owner-readable only (the mode is ignored where unsupported).
        fd = os.open(TOKEN_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(creds.to_json())

    return build("gmail", "v1", credentials=creds, cache_discovery=False)
32
+
33
+
34
def gmail_call(request):
    """Execute a Gmail API request, retrying transient HTTP failures.

    Responses with status 429, 500, 502, 503 or 504 are retried up to
    ``MAX_RETRIES`` times with exponential backoff (capped at 60 seconds)
    plus up to one second of random jitter. Any other ``HttpError`` is
    re-raised immediately.

    Returns:
        Whatever ``request.execute`` returns.

    Raises:
        HttpError: for a non-retryable status.
        RuntimeError: when every attempt failed with a retryable status; the
            last ``HttpError`` is attached as the cause.
    """
    last_error = None
    for attempt in range(MAX_RETRIES):
        try:
            return request.execute(num_retries=3)
        except HttpError as e:
            status = getattr(e.resp, "status", None)
            if status not in (429, 500, 502, 503, 504):
                raise
            last_error = e
            if attempt + 1 >= MAX_RETRIES:
                # No attempt follows, so waiting would only delay the error.
                break
            wait = min(60, 2 ** attempt) + random.random()
            print(
                f"\nHTTP {status} retry {attempt+1}/{MAX_RETRIES} wait={wait:.1f}s")
            time.sleep(wait)
    raise RuntimeError("Max retries exceeded") from last_error
49
+
50
+
51
def trash_email(gmail_id):
    """Move one message to the Gmail trash.

    The request goes through ``gmail_call`` so rate-limit and server errors
    are retried with backoff, which matters because callers run this from
    several threads at once.

    Returns:
        ``True`` on success, otherwise the error message as a string. The
        function never raises ``Exception`` subclasses, so it is safe to use
        as a thread-pool task.
    """
    try:
        # Building the service is inside the try so that credential or
        # network problems are reported through the return value as well.
        service = get_gmail_service()
        request = service.users().messages().trash(userId="me", id=gmail_id)
        gmail_call(request)
        return True
    except Exception as e:
        return str(e)
@@ -0,0 +1,28 @@
1
# config.py
# Settings shared by the sync, analyze and clean commands.

# SQLite database that stores the synced messages (relative path).
DB_FILE = "gmail.db"
# OAuth client-secrets file handed to the installed-app flow.
CREDS_FILE = "credentials.json"
# File where the authorized user's token JSON is cached between runs.
TOKEN_FILE = "token.json"

# OAuth scopes requested when authorizing.
SCOPES = [
    "https://www.googleapis.com/auth/gmail.modify"
]

# Thread-pool size used when downloading messages during sync.
MAX_WORKERS = 20
# Maximum attempts gmail_call makes for a retryable HTTP error.
MAX_RETRIES = 8
# Number of saved messages between database commits during sync.
COMMIT_BATCH = 100

# --------------------------------------------------
# CONFIGURABLE PATTERNS
# --------------------------------------------------
# A pattern is either a full address or a domain rule that starts with "@";
# a domain rule also matches subdomains. Matching is case-insensitive.
#
# Add email addresses or domains you want to automatically categorize for deletion here.
# Example: "newsletter@example.com", "@spamdomain.com"
AUTO_DELETE_EMAIL_PATTERNS = [
    # Add your auto delete patterns here
]

# Add email addresses or domains you want to protect from accidental deletion.
# Messages from matching senders are filtered out when data is loaded.
# Example: "personal@gmail.com", "@mybank.com"
PROTECTED_EMAIL_PATTERNS = [
    # Add your protected email patterns here
]
gmail_cleaner/db.py ADDED
@@ -0,0 +1,68 @@
1
+ import sqlite3
2
+ from gmail_cleaner.config import DB_FILE
3
+
4
+
5
def init_db():
    """Open ``DB_FILE`` and make sure the schema exists.

    Creates the ``gmail_messages`` and ``sync_state`` tables and the
    ``internal_date`` index if they are missing, and adds the ``deleted``
    column to databases created before that column existed.

    Returns:
        An open ``sqlite3.Connection`` created with
        ``check_same_thread=False``. The caller is responsible for closing it.
    """
    conn = sqlite3.connect(
        DB_FILE,
        check_same_thread=False
    )

    conn.execute("""
        CREATE TABLE IF NOT EXISTS gmail_messages (
            id TEXT PRIMARY KEY,
            thread_id TEXT,
            history_id INTEGER,
            internal_date INTEGER,
            label_ids TEXT,
            subject TEXT,
            sender TEXT,
            recipients TEXT,
            snippet TEXT,
            body_text TEXT,
            raw_json TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    """)

    conn.execute("""
        CREATE TABLE IF NOT EXISTS sync_state (
            key TEXT PRIMARY KEY,
            value TEXT
        )
    """)

    conn.execute("""
        CREATE INDEX IF NOT EXISTS idx_internal_date
        ON gmail_messages(internal_date)
    """)

    # Soft-delete flag read by the analysis/cleaning queries. Check for the
    # column explicitly instead of swallowing OperationalError, so that real
    # failures (locked or read-only database) are not hidden.
    existing_columns = {
        row[1] for row in conn.execute("PRAGMA table_info(gmail_messages)")
    }
    if "deleted" not in existing_columns:
        conn.execute("""
            ALTER TABLE gmail_messages
            ADD COLUMN deleted INTEGER NOT NULL DEFAULT 0
        """)

    conn.commit()
    return conn
51
+
52
+
53
def get_state(conn, key):
    """Look up *key* in ``sync_state``; return its value, or None if absent."""
    cursor = conn.execute("SELECT value FROM sync_state WHERE key=?", (key,))
    record = cursor.fetchone()
    if record:
        return record[0]
    return None
59
+
60
+
61
def set_state(conn, key, value):
    """Insert or overwrite *key* in ``sync_state``, storing ``str(value)``.

    The statement is not committed here; the caller commits.
    """
    params = (key, str(value))
    upsert_sql = """
        INSERT OR REPLACE INTO sync_state(key,value)
        VALUES (?,?)
        """
    conn.execute(upsert_sql, params)
@@ -0,0 +1,341 @@
1
+ import re
2
+ import sqlite3
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from collections import Counter
5
+ from tqdm import tqdm
6
+
7
+ from gmail_cleaner.config import DB_FILE
8
+ from gmail_cleaner.auth import trash_email
9
+ from gmail_cleaner.analyze import load_data, matches_auto_delete
10
+ from sklearn.feature_extraction import text as sklearn_text
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.cluster import MiniBatchKMeans
13
+
14
# Thread-pool size used when trashing messages.
DELETE_WORKERS = 10
# Case-insensitive keyword alternations used to spot promotional and
# newsletter mail. They are plain substring matches (no word boundaries).
PROMOTION_REGEX = r"sale|offer|discount|coupon|deal|cashback|save|limited time"
NEWSLETTER_REGEX = r"unsubscribe|newsletter|digest|weekly|daily"
# Menu key typed by the user -> internal strategy name.
DELETE_STRATEGIES = {
    "1": "top_senders",
    "2": "newsletter",
    "3": "promotions",
    "4": "subject_patterns",
    "5": "topic_clusters",
    "6": "auto_delete_matches",
}

# Message ids queued for deletion across prompts (module-level state).
pending_delete_ids = []
27
+
28
+
29
def normalize_subject(subject):
    """Reduce a subject line to a pattern that groups similar messages.

    Lower-cases the text, replaces every run of digits with ``<num>`` and
    collapses whitespace runs to single spaces. Falsy input yields "".
    """
    if not subject:
        return ""
    lowered = subject.lower()
    digits_masked = re.sub(r"\d+", "<num>", lowered)
    single_spaced = re.sub(r"\s+", " ", digits_masked)
    return single_spaced.strip()
35
+
36
+
37
def choose_delete_strategy():
    """Show the strategy menu and return the chosen strategy name.

    Keeps prompting until the user enters a key present in
    ``DELETE_STRATEGIES`` or "q". Returns None when the user quits.
    """
    menu_lines = (
        "\nDELETE STRATEGIES",
        "-" * 100,
        "[1] Top Senders",
        "[2] Newsletters",
        "[3] Promotions",
        "[4] Subject Patterns",
        "[5] Topic Clusters",
        "[6] Auto Delete Matches",
        "[q] Quit",
    )
    for line in menu_lines:
        print(line)

    while True:
        choice = input("\nStrategy: ").strip().lower()
        if choice == "q":
            return None
        strategy = DELETE_STRATEGIES.get(choice)
        if strategy is not None:
            return strategy
        print("Invalid choice")
54
+
55
+ # Strategy Builders
56
+
57
+
58
def get_top_sender_candidates(df):
    """Yield ``(email, message_count)`` for the 100 most frequent senders."""
    sender_counts = df["email"].value_counts()
    busiest = sender_counts.head(100)
    return busiest.items()
60
+
61
+
62
def get_newsletter_candidates(df):
    """Yield ``(email, count)`` for senders of newsletter-looking messages.

    A message counts when its subject or body text matches
    ``NEWSLETTER_REGEX`` (case-insensitive). At most 100 senders are returned,
    most frequent first.
    """
    def mentions_newsletter(column):
        return df[column].fillna("").str.contains(
            NEWSLETTER_REGEX, case=False, regex=True)

    newsletter_df = df[mentions_newsletter("subject") | mentions_newsletter("body_text")]
    per_sender = newsletter_df["email"].value_counts()
    return per_sender.head(100).items()
69
+
70
+
71
def get_promotion_candidates(df):
    """Yield ``(email, count)`` for senders of promotional-looking messages.

    A message counts when its subject or snippet matches ``PROMOTION_REGEX``
    (case-insensitive). At most 100 senders are returned, most frequent first.
    """
    def looks_promotional(column):
        return df[column].fillna("").str.contains(
            PROMOTION_REGEX, case=False, regex=True)

    promo_df = df[looks_promotional("subject") | looks_promotional("snippet")]
    per_sender = promo_df["email"].value_counts()
    return per_sender.head(100).items()
78
+
79
+
80
def get_subject_pattern_candidates(df):
    """Return the 100 most common normalized subjects as ``(pattern, count)``."""
    subjects = df["subject"].fillna("")
    pattern_counts = Counter(normalize_subject(subject) for subject in subjects)
    return pattern_counts.most_common(100)
85
+
86
+
87
def get_cluster_candidates(df):
    """Cluster messages by subject and describe each cluster.

    Subjects are vectorized with TF-IDF over unigrams and bigrams and grouped
    with mini-batch k-means.

    Returns:
        A list of ``(cluster_id, size, keywords, labels)`` tuples, where
        ``keywords`` is a comma-separated string of the five strongest terms
        and ``labels`` is the per-row cluster assignment in the row order of
        *df*. The list is empty when *df* has no rows or when no term
        survives the vectorizer's ``min_df`` pruning.
    """
    documents = df["subject"].fillna("")
    if len(documents) == 0:
        return []

    CUSTOM_STOPWORDS = {"email", "dear", "thank", "thanks",
                        "hi", "hello", "com", "www", "http", "https"}
    stop_words = sklearn_text.ENGLISH_STOP_WORDS.union(CUSTOM_STOPWORDS)
    vectorizer = TfidfVectorizer(stop_words=list(
        stop_words), max_features=10000, min_df=3, ngram_range=(1, 2))

    try:
        X = vectorizer.fit_transform(documents)
    except ValueError:
        # Raised when pruning leaves an empty vocabulary (small mailboxes).
        print("Not enough repeated subject terms to build clusters.")
        return []

    # k-means cannot fit more clusters than there are rows.
    cluster_count = min(100, max(20, len(df) // 20), X.shape[0])
    model = MiniBatchKMeans(n_clusters=cluster_count,
                            random_state=42, batch_size=2048)
    model.fit(X)
    terms = vectorizer.get_feature_names_out()

    clusters = []
    for cluster_id in range(cluster_count):
        center = model.cluster_centers_[cluster_id]
        # Highest-weighted centroid terms, strongest first.
        top_indices = center.argsort()[-5:][::-1]
        keywords = ", ".join(terms[i] for i in top_indices)
        size = (model.labels_ == cluster_id).sum()
        clusters.append((cluster_id, size, keywords, model.labels_))
    return clusters
113
+
114
+
115
def get_auto_delete_candidates(df):
    """Yield ``(email, count)`` for every sender matching an auto-delete pattern.

    Senders are ordered by message count, highest first.
    """
    # astype(bool) keeps the mask boolean for an empty frame, where apply()
    # would otherwise return an object-dtype Series.
    matches = df["email"].apply(matches_auto_delete).astype(bool)
    auto_df = df[matches]
    return auto_df["email"].value_counts().items()
118
+
119
+ # Selection Parsing
120
+
121
+
122
def parse_selection(selection, max_index):
    """Parse a selection such as ``"1,2,5-10"`` into zero-based indexes.

    Args:
        selection: Comma-separated 1-based numbers and inclusive ``a-b``
            ranges. Blank, malformed or half-open parts are ignored.
        max_index: Largest valid 1-based number.

    Returns:
        A set of zero-based indexes, each in ``range(max_index)``.
    """
    indexes = set()
    for part in selection.split(","):
        part = part.strip()
        if not part:
            continue

        start_text, dash, end_text = part.partition("-")
        try:
            if dash:
                if not start_text or not end_text:
                    continue
                first, last = int(start_text), int(end_text)
            else:
                first = last = int(part)
        except ValueError:
            continue

        # Clamp before iterating so an entry like "1-999999999999" costs no
        # more than max_index steps.
        first = max(first, 1)
        last = min(last, max_index)
        indexes.update(range(first - 1, last))
    return indexes
143
+
144
+
145
def build_pattern_deletes(sender_df, candidates, selection):
    """Turn the user's answer for one sender into a list of message ids.

    Args:
        sender_df: Messages of a single sender (needs ``id``, ``subject`` and
            ``internal_date`` columns).
        candidates: Sequence of ``(pattern, count)`` pairs as shown to the
            user, numbered from 1.
        selection: The raw answer. ``q`` aborts the flow, ``d`` selects
            nothing, ``p`` selects every message, ``kN`` keeps the N newest
            messages and selects the rest; anything else is parsed as a list
            of pattern numbers (e.g. ``1,2,5-10``).

    Returns:
        The ids to delete (possibly empty).

    Raises:
        KeyboardInterrupt: when the answer is ``q``.
    """
    selection = selection.strip().lower()
    if selection == "q":
        raise KeyboardInterrupt()
    if selection == "d":
        return []

    if selection == "p" or selection.startswith("k"):
        try:
            keep = int(selection[1:] or "0")
        except ValueError:
            keep = -1
        if keep < 0:
            # A negative count would slice from the wrong end and delete the
            # oldest messages instead of keeping the newest.
            print("Invalid selection")
            return []
        newest_first = sender_df.sort_values("internal_date", ascending=False)
        return newest_first.iloc[keep:]["id"].dropna().tolist()

    indexes = parse_selection(selection, len(candidates))
    delete_patterns = {candidates[idx][0] for idx in indexes}
    if not delete_patterns:
        return []

    normalized = sender_df["subject"].fillna("").map(normalize_subject)
    return sender_df.loc[normalized.isin(delete_patterns), "id"].tolist()
168
+
169
+ # Handlers
170
+
171
+
172
def handle_sender(sender, sender_count, df):
    """Show one sender's 20 most common subject patterns and ask what to delete.

    Returns the message ids chosen through ``build_pattern_deletes``.
    """
    sender_df = df[df["email"] == sender]
    pattern_counts = Counter(
        normalize_subject(subject)
        for subject in sender_df["subject"].fillna("")
    )
    candidates = pattern_counts.most_common(20)

    print(f"\n{sender} ({sender_count:,})")
    for number, (pattern, count) in enumerate(candidates, start=1):
        print(f"[{number}] {count:,} {pattern[:120]}")

    answer = input("\nChoice: ").strip()
    return build_pattern_deletes(sender_df, candidates, answer)
185
+
186
+
187
def handle_newsletter(sender, sender_count, df):
    """Preview a newsletter sender's ten newest subjects and ask to delete all.

    Returns every message id of the sender when the user answers "y",
    otherwise an empty list.
    """
    sender_df = df[df["email"] == sender]
    print(f"\nNEWSLETTER: {sender}\nEmails: {sender_count:,}")
    latest = sender_df.sort_values("internal_date", ascending=False).head(10)
    # Subjects can be missing; fillna avoids slicing None.
    for subject in latest["subject"].fillna(""):
        print(subject[:120])
    choice = input("\nDelete all? [y/N]: ")
    if choice.strip().lower() != 'y':
        return []
    return sender_df["id"].tolist()
197
+
198
+
199
def handle_promotion(sender, sender_count, df):
    """Show a sender's promo score and ask whether to delete all their mail.

    The score is the share of the sender's subjects matching
    ``PROMOTION_REGEX``. Returns all of the sender's ids on "y", else [].
    """
    sender_df = df[df["email"] == sender]
    promo_hits = sender_df["subject"].fillna("").str.contains(
        PROMOTION_REGEX, case=False, regex=True)
    score = promo_hits.mean()
    print(f"\n{sender}\nEmails : {sender_count:,}\nPromo Score: {score:.2f}")
    answer = input("\nDelete all? [y/N]: ")
    if answer.lower() == 'y':
        return sender_df["id"].tolist()
    return []
208
+
209
+
210
def handle_subject_pattern(pattern, count, df):
    """Show sample subjects for a normalized pattern and ask to delete them.

    Returns the ids of every message whose normalized subject equals
    *pattern* when the user answers "y", otherwise an empty list.
    """
    print(f"\nPATTERN\n{count:,} emails\n{pattern}")
    matching = df[df["subject"].fillna("").apply(normalize_subject) == pattern]

    print("\nSamples")
    for sample in matching["subject"].head(10):
        print(f" - {sample}")

    answer = input("\nDelete all? [y/N]: ")
    if answer.lower() == 'y':
        return matching["id"].tolist()
    return []
221
+
222
+
223
def handle_cluster(cluster_id, size, keywords, labels, df):
    """Describe one topic cluster and ask whether to delete its messages.

    *labels* is compared element-wise with *cluster_id* to select the rows of
    *df*. Returns their ids on "y", otherwise an empty list.
    """
    summary = f"\nCLUSTER {cluster_id + 1}\nEmails: {size:,}\nKeywords: {keywords}"
    print(summary)
    answer = input("\nDelete cluster? [y/N]: ")
    if answer.lower() == 'y':
        members = labels == cluster_id
        return df[members]["id"].tolist()
    return []
230
+
231
+
232
def handle_auto_delete(sender, sender_count, df):
    """Preview a sender that matches an auto-delete rule and confirm deletion.

    The prompt defaults to yes: every answer except one beginning with "n"
    (after trimming whitespace, any case) selects all of the sender's
    messages.

    Returns:
        The sender's message ids, or an empty list when the user declines.
    """
    sender_df = df[df["email"] == sender]
    print(f"\nAUTO RULE MATCH\n{sender}\n{sender_count:,} emails")
    latest = sender_df["subject"].dropna().head(10)
    for s in latest:
        print(f" - {s}")
    choice = input("\nDelete all? [Y/n]: ")
    # "no" or " n" must not fall through to the destructive default.
    if choice.strip().lower().startswith('n'):
        return []
    return sender_df["id"].tolist()
242
+
243
+
244
def delete_gmail(ids, dry_run=False):
    """Trash the given messages in Gmail and flag them as deleted locally.

    Duplicate ids are collapsed first. With ``dry_run`` the ids are only
    printed. Otherwise each id is passed to ``trash_email`` on a thread pool
    of ``DELETE_WORKERS`` workers, and every id that was trashed successfully
    gets ``deleted = 1`` in ``gmail_messages``.

    Returns:
        The ids that could not be trashed (empty for a dry run or empty
        input), so the caller can keep them queued.
    """
    ids_to_delete = list(set(ids))
    if not ids_to_delete:
        return []

    print(f"\nFinal delete pass: {len(ids_to_delete):,} emails")
    if dry_run:
        print("[DRY RUN] Would delete the following IDs:")
        for gid in ids_to_delete:
            print(f" - {gid}")
        return []

    failed = 0
    successful_ids = []

    with ThreadPoolExecutor(max_workers=DELETE_WORKERS) as executor:
        future_to_id = {executor.submit(
            trash_email, gid): gid for gid in ids_to_delete}
        for future in tqdm(as_completed(future_to_id), total=len(future_to_id), desc="Deleting"):
            gmail_id = future_to_id[future]
            try:
                outcome = future.result()
            except Exception as e:
                # One failing worker must not abort the pass, otherwise the
                # messages already trashed would never be recorded below.
                outcome = str(e)
            if outcome is True:
                successful_ids.append(gmail_id)
            else:
                failed += 1

    if successful_ids:
        db_conn = sqlite3.connect(DB_FILE)
        try:
            # The connection context manager commits, or rolls back on error.
            with db_conn:
                db_conn.executemany(
                    "UPDATE gmail_messages SET deleted = 1 WHERE id = ?",
                    [(gid,) for gid in successful_ids])
        finally:
            db_conn.close()

    print(f"\nDeleted: {len(successful_ids):,}")
    print(f"Failed : {failed:,}")
    trashed = set(successful_ids)
    return [x for x in ids_to_delete if x not in trashed]
281
+
282
+
283
def handle_delete(delete_ids, skip=False, dry_run=False):
    """Queue *delete_ids* and, unless *skip* is set, ask what to do next.

    At the prompt an empty answer, "yes" or "y" keeps queueing, "now"
    processes the whole queue through ``delete_gmail`` (keeping whatever it
    returns as the new queue), and "exit" returns the string "exit". Every
    other path returns None.
    """
    global pending_delete_ids
    pending_delete_ids.extend(delete_ids)
    print(f"\nPending delete queue: {len(pending_delete_ids):,} emails")

    if skip:
        return None

    action = input("\n[yes | now | exit] : ").strip().lower()
    if action in ("", "yes", "y"):
        return None
    if action == "exit":
        return "exit"
    if action == "now":
        pending_delete_ids = delete_gmail(
            pending_delete_ids, dry_run=dry_run)
    return None
297
+
298
+
299
def run_delete_flow(dry_run=False):
    """Run the interactive cleaning session.

    Loads the mailbox, lets the user pick a strategy, then walks that
    strategy's candidates: each one is shown by its handler and the ids it
    returns are queued through ``handle_delete``. The loop stops when the
    user answers "exit", when a handler raises ``KeyboardInterrupt``, or when
    the candidates run out. Whatever is still queued is then processed.

    Args:
        dry_run: When true, deletions are printed instead of executed.
    """
    global pending_delete_ids
    df = load_data()
    strategy = choose_delete_strategy()
    if not strategy:
        return

    # strategy -> (candidate builder, handler, skip the queue prompt?)
    # Built at call time so the current module-level functions are used.
    flows = {
        "top_senders": (get_top_sender_candidates, handle_sender, True),
        "newsletter": (get_newsletter_candidates, handle_newsletter, False),
        "promotions": (get_promotion_candidates, handle_promotion, False),
        "subject_patterns": (get_subject_pattern_candidates, handle_subject_pattern, False),
        "topic_clusters": (get_cluster_candidates, handle_cluster, False),
        "auto_delete_matches": (get_auto_delete_candidates, handle_auto_delete, False),
    }

    try:
        flow = flows.get(strategy)
        if flow is not None:
            get_candidates, handler, skip_prompt = flow
            for candidate in get_candidates(df):
                # Candidates are tuples; their fields precede df in every
                # handler signature.
                ids = handler(*candidate, df)
                if handle_delete(ids, skip_prompt, dry_run=dry_run) == "exit":
                    break
    except KeyboardInterrupt:
        print("\nExiting interactive loop.")

    if pending_delete_ids:
        # Keep only what delete_gmail reports back as the remaining queue,
        # matching what the "now" action in handle_delete does.
        pending_delete_ids = delete_gmail(pending_delete_ids, dry_run=dry_run)
gmail_cleaner/main.py ADDED
@@ -0,0 +1,60 @@
1
+ import argparse
2
+
3
+ from gmail_cleaner.db import init_db, get_state
4
+ from gmail_cleaner.auth import get_gmail_service
5
+ from gmail_cleaner.sync import full_sync, incremental_sync
6
+ from gmail_cleaner.analyze import run_analysis
7
+ from gmail_cleaner.delete import run_delete_flow
8
+
9
+
10
def sync_command(args):
    """Handle ``sync``: full sync on first run, incremental sync afterwards.

    A stored ``history_id`` in ``sync_state`` selects the incremental path.
    Prints the number of messages in the database when done.

    Args:
        args: Parsed command-line namespace (unused).
    """
    conn = init_db()
    try:
        history_id = get_state(conn, "history_id")

        if history_id is None:
            full_sync(conn)
        else:
            incremental_sync(conn, history_id)

        total = conn.execute("SELECT COUNT(*) FROM gmail_messages").fetchone()[0]
        print(f"\nDatabase now contains {total:,} emails")
    finally:
        # Close the database even when the sync fails part-way.
        conn.close()
22
+
23
+
24
def analyze_command(args):
    """Handle ``analyze``: print the mailbox report. *args* is unused."""
    del args  # accepted only to match the sub-command handler signature
    run_analysis()
26
+
27
+
28
def clean_command(args):
    """Handle ``clean``: start the interactive delete flow."""
    simulate_only = args.dry_run
    run_delete_flow(dry_run=simulate_only)
30
+
31
+
32
def main():
    """Parse the command line and dispatch to the selected sub-command."""
    parser = argparse.ArgumentParser(
        description="Gmail Sync and Analysis Tool")
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True

    # (name, help text, handler) for every sub-command.
    command_table = (
        ("sync", "Synchronize emails from Gmail to local DB", sync_command),
        ("analyze", "Analyze downloaded emails and show statistics", analyze_command),
        ("clean", "Interactive CLI to bulk delete emails", clean_command),
    )
    registered = {}
    for name, help_text, handler in command_table:
        sub = subparsers.add_parser(name, help=help_text)
        sub.set_defaults(func=handler)
        registered[name] = sub

    # Only the clean command takes an option.
    registered["clean"].add_argument(
        "--dry-run", action="store_true",
        help="Simulate deletion without calling Gmail API")

    args = parser.parse_args()
    args.func(args)
57
+
58
+
59
+ if __name__ == "__main__":
60
+ main()
gmail_cleaner/sync.py ADDED
@@ -0,0 +1,160 @@
1
+ import json
2
+ import base64
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from threading import Lock
5
+ from tqdm import tqdm
6
+
7
+ from gmail_cleaner.config import MAX_WORKERS, COMMIT_BATCH
8
+ from gmail_cleaner.db import set_state
9
+ from gmail_cleaner.auth import gmail_call, get_gmail_service
10
+
11
+
12
def decode_body(data):
    """Decode a base64url-encoded message body to text.

    Missing ``=`` padding is restored before decoding, so unpadded input is
    decoded instead of being silently dropped. Bytes that are not valid
    UTF-8 are ignored.

    Returns:
        The decoded text, or "" for empty or undecodable input.
    """
    if not data:
        return ""
    try:
        # base64 input must be a multiple of four characters long.
        padded = data + "=" * (-len(data) % 4)
        raw = base64.urlsafe_b64decode(padded.encode("utf-8"))
        return raw.decode("utf-8", errors="ignore")
    except (ValueError, TypeError, AttributeError):
        # binascii.Error is a ValueError; the other two cover non-string
        # input. Decoding stays best-effort.
        return ""
19
+
20
+
21
def extract_text(payload):
    """Return the plain-text body found in a message payload dict.

    Lookup order: the payload's own body data, then the first direct part
    with MIME type ``text/plain``, then (only if neither exists) the same
    search inside parts that themselves contain parts.

    Returns:
        The decoded text, or "" when nothing is found.
    """
    body = payload.get("body", {}).get("data")
    if body:
        return decode_body(body)

    parts = payload.get("parts", [])
    for part in parts:
        if part.get("mimeType") == "text/plain":
            return decode_body(part.get("body", {}).get("data"))

    # No direct text/plain part: look inside nested containers.
    for part in parts:
        if part.get("parts"):
            nested_text = extract_text(part)
            if nested_text:
                return nested_text

    return ""
32
+
33
+
34
def fetch_message(message_id):
    """Fetch one message resource by id; return None (after logging) on failure."""
    try:
        messages_api = get_gmail_service().users().messages()
        # NOTE(review): format="metadata" presumably returns headers without
        # body content, which would leave body_text empty downstream.
        # Confirm against the Gmail API documentation.
        request = messages_api.get(
            userId="me", id=message_id, format="metadata")
        return gmail_call(request)
    except Exception as e:
        print(f"\nFAILED {message_id}: {e}")
        return None
43
+
44
+
45
def save_message(conn, msg):
    """Insert or replace one message resource in ``gmail_messages``.

    Header names are matched case-insensitively, so ``subject`` or ``FROM``
    are found as well as ``Subject`` and ``From``. When a header occurs more
    than once, the last occurrence is stored. The statement is not committed
    here; the caller commits.

    Args:
        conn: Open SQLite connection.
        msg: Message dict; must contain ``id``, ``threadId`` and ``payload``.
    """
    headers = {h["name"].lower(): h["value"]
               for h in msg["payload"].get("headers", [])}
    body_text = extract_text(msg["payload"])

    conn.execute("""
        INSERT OR REPLACE INTO gmail_messages (
            id, thread_id, history_id, internal_date, label_ids,
            subject, sender, recipients, snippet, body_text, raw_json
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        msg["id"],
        msg["threadId"],
        int(msg.get("historyId", 0)),
        int(msg.get("internalDate", 0)),
        json.dumps(msg.get("labelIds", [])),
        headers.get("subject"),
        headers.get("from"),
        headers.get("to"),
        msg.get("snippet"),
        body_text,
        json.dumps(msg)
    ))
68
+
69
+
70
def full_sync(conn):
    """Download every message in the mailbox into the local database.

    Message ids are listed page by page, then fetched in batches of 500 on a
    thread pool of ``MAX_WORKERS`` workers. Saving and committing happen on
    the calling thread, every ``COMMIT_BATCH`` saved messages and at the end
    of each batch. The highest ``historyId`` seen is stored as the
    ``history_id`` sync state.

    Args:
        conn: Open SQLite connection with the schema already created.
    """
    print("Starting full sync...")
    service = get_gmail_service()
    all_ids = []

    request = service.users().messages().list(userId="me", maxResults=500)
    while request:
        response = gmail_call(request)
        all_ids.extend(response.get("messages", []))
        print(f"\rFound {len(all_ids):,} emails...", end="", flush=True)
        request = service.users().messages().list_next(request, response)

    total = len(all_ids)
    print(f"\nFound {total:,} emails")

    max_history_id = 0
    processed = 0
    BATCH_SIZE = 500

    for batch_start in range(0, total, BATCH_SIZE):
        batch = all_ids[batch_start:batch_start + BATCH_SIZE]
        print(
            f"\nBatch {batch_start:,}-{batch_start + len(batch):,} of {total:,}")

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = [executor.submit(fetch_message, item["id"])
                       for item in batch]
            for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
                try:
                    msg = future.result()
                    if not msg:
                        # fetch_message already reported the failure.
                        continue
                    save_message(conn, msg)
                    max_history_id = max(
                        max_history_id, int(msg.get("historyId", 0)))
                    processed += 1
                    if processed % COMMIT_BATCH == 0:
                        conn.commit()
                except Exception as e:
                    print(f"\nWorker error: {e}")

        conn.commit()
        print(f"Saved {processed:,}/{total:,}")

    if max_history_id > 0:
        set_state(conn, "history_id", max_history_id)
        conn.commit()
    # Otherwise no cursor is stored: a saved "0" would send the next run
    # down the incremental path with a meaningless start point.
    print(f"\nFull sync complete ({processed:,})")
117
+
118
+
119
def incremental_sync(conn, last_history_id):
    """Download messages added since *last_history_id*.

    Walks the mailbox history from the stored cursor, collects the ids of
    added messages, fetches them on a thread pool and saves them on the
    calling thread. The newest history id seen is stored as the new cursor.
    If the API answers HTTP 404 for the cursor, a full sync is run instead.

    Args:
        conn: Open SQLite connection with the schema already created.
        last_history_id: Cursor stored by the previous sync (string or int).
    """
    # Local import: this module does not import HttpError at the top.
    from googleapiclient.errors import HttpError

    print(f"Incremental sync from history {last_history_id}")
    service = get_gmail_service()
    request = service.users().history().list(
        userId="me", startHistoryId=last_history_id)

    new_ids = []
    newest_history = int(last_history_id)

    try:
        while request:
            response = gmail_call(request)
            for item in response.get("history", []):
                newest_history = max(newest_history, int(item["id"]))
                for added in item.get("messagesAdded", []):
                    new_ids.append(added["message"]["id"])
            request = service.users().history().list_next(request, response)
    except HttpError as e:
        if getattr(e.resp, "status", None) != 404:
            raise
        # NOTE(review): 404 is treated as an expired or invalid history
        # cursor, per the Gmail API sync guide; confirm.
        print("History cursor no longer valid; running a full sync instead")
        full_sync(conn)
        return

    new_ids = list(set(new_ids))

    if not new_ids:
        print("No new emails")
        return

    print(f"Found {len(new_ids):,} new emails")
    processed = 0

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(
            fetch_message, msg_id): msg_id for msg_id in new_ids}
        for future in tqdm(as_completed(futures), total=len(new_ids), desc="Fetching"):
            msg = future.result()
            if msg:
                # Saving runs on this thread only, so no lock is needed.
                save_message(conn, msg)
                processed += 1
                if processed % COMMIT_BATCH == 0:
                    conn.commit()

    set_state(conn, "history_id", newest_history)
    conn.commit()
    print(f"Incremental sync complete ({processed:,})")
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.4
2
+ Name: gmail_cleaner
3
+ Version: 0.1.0
4
+ Summary: A powerful local CLI tool to sync, cluster, and surgically clean your Gmail.
5
+ Author-email: Mayank Gupta <mayankgupta690@gmail.com>
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: google-api-python-client
10
+ Requires-Dist: google-auth-httplib2
11
+ Requires-Dist: google-auth-oauthlib
12
+ Requires-Dist: pandas
13
+ Requires-Dist: scikit-learn
14
+ Requires-Dist: tqdm
15
+ Dynamic: license-file
16
+
17
+ <p align="center">
18
+ <img src="assets/logo.png" width="220" alt="Gmail Cleaner Logo">
19
+ </p>
20
+
21
+ <h1 align="center">Gmail Cleaner</h1>
22
+
23
+ <p align="center">
24
+ <em>Because somewhere in those 50,000 emails is a tax document you actually need.</em>
25
+ </p>
26
+
27
+ <p align="center">
28
+ <a href="https://pypi.org/project/gmail-cleaner/"><img src="https://img.shields.io/pypi/v/gmail-cleaner.svg?color=blue" alt="PyPI version"></a>
29
+ <a href="https://codecov.io/gh/immkg/gmail-cleaner"><img src="https://codecov.io/gh/immkg/gmail-cleaner/branch/main/graph/badge.svg" alt="Coverage"></a>
30
+ <a href="https://github.com/immkg/gmail-cleaner/releases/latest"><img src="https://img.shields.io/github/v/release/immkg/gmail-cleaner?style=flat-square&color=brightgreen&label=release" alt="Release"></a>
31
+ <img src="https://img.shields.io/badge/license-MIT-111111" alt="MIT license">
32
+ </p>
33
+
34
+ <p align="center">
35
+ <strong>Built by Mayank Gupta</strong><br>
36
+ <sub>Surgical precision for your inbox. ~99% less clutter &middot; ~100% more sanity &middot; 100% local.</sub>
37
+ </p>
38
+
39
+ ---
40
+
41
+ You have 50,000 emails. Somewhere in that mountain of newsletters, random promotions, and auto-generated alerts is a critical message from your bank that you cannot afford to lose. You can't just 'Select All -> Delete'.
42
+
43
+ You need a surgical tool.
44
+
45
+ With Gmail Cleaner:
46
+
47
+ ```bash
48
+ # It just deletes them.
49
+ python -m gmail_cleaner.main clean
50
+ ```
51
+
52
+ Gmail Cleaner synchronizes your Gmail account into a local SQLite database, analyzes your emails using Pandas and Scikit-Learn, and interactively bulk-deletes the noise based on aggressive, local strategies.
53
+
54
+ ## Before / after
55
+
56
+ You try to find an important bank email. Your search results are flooded with 400 "Limited Time Offer!" emails from a newsletter you never subscribed to.
57
+
58
+ After Gmail Cleaner: You actually see your bank email.
59
+
60
+ ## Setup
61
+
62
+ The most effort Gmail Cleaner will ever ask of you:
63
+
64
+ ### 1. Enable the Gmail API
65
+ 1. Go to the [Google Cloud Console](https://console.cloud.google.com/).
66
+ 2. Create a new project or select an existing one.
67
+ 3. Navigate to **APIs & Services > Library**.
68
+ 4. Search for "Gmail API" and click **Enable**.
69
+
70
+ ### 2. Set Up Desktop App Credentials
71
+ 1. Go to **APIs & Services > OAuth consent screen**. Choose **External** and add your own Gmail address under **Test users**.
72
+ 2. Go to **APIs & Services > Credentials**.
73
+ 3. Click **Create Credentials** > **OAuth client ID**.
74
+ 4. Choose **Desktop app**.
75
+ 5. Click **Download JSON** on the confirmation dialog.
76
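+ 6. Rename the file to `credentials.json` and place it in the directory you run the tool from (the tool looks for `credentials.json`, `token.json`, and `gmail.db` relative to the current working directory).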
77
+
78
+ ### 3. Install
79
+ You can install directly from PyPI or download the standalone `.exe` from the latest GitHub Release.
80
+
81
+ ```bash
82
+ pip install gmail-cleaner
83
+ ```
84
+
85
+ ## Configuration
86
+
87
+ Before running the clean command, open `gmail_cleaner/config.py` and customize your patterns:
88
+
89
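+ - `AUTO_DELETE_EMAIL_PATTERNS`: Add email addresses or domains that you always want to delete (e.g. `newsletter@spam.com`). The "Auto Delete Matches" strategy lists every matching sender and asks for confirmation, with "yes" as the default answer.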
90
+ - `PROTECTED_EMAIL_PATTERNS`: Add personal or banking emails that should NEVER be deleted by the tool (e.g. `@yourbank.com`).
91
+
92
+ *Lazy, not negligent: The code ensures these patterns are scrubbed from version control, so you can safely keep this repo synced without leaking your personal contacts.*
93
+
94
+ ## Commands
95
+
96
+ | Command | What it does |
97
+ |---------|--------------|
98
+ | `python -m gmail_cleaner.main sync` | Syncs your emails to a local SQLite database. |
99
+ | `python -m gmail_cleaner.main analyze` | Analyzes emails to show top domains, senders, and TF-IDF topic clusters. |
100
+ | `python -m gmail_cleaner.main clean` | Starts the interactive CLI to bulk delete emails based on strategies (newsletters, promotions, etc.). |
101
+ | `python -m gmail_cleaner.main clean --dry-run` | Simulates the deletion queue and prints what *would* be deleted without touching the Gmail API. |
102
+
103
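+ *When cleaning, you can select specific subject patterns by number (e.g., `1,2,5-10`), or use `k10` to keep the latest 10 emails and delete the rest. Deletions accumulate in a queue: at the prompt, type `yes` (or press Enter) to keep queueing, `now` to process the queue immediately, or `exit` to stop reviewing and process whatever is queued.*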
104
+
105
+ ## FAQ
106
+
107
+ **Does it upload my emails to a random server?**
108
+ No. It downloads them to a local SQLite database (`gmail.db`). The TF-IDF analysis, clustering, and deleting all happen entirely on your machine. 100% local.
109
+
110
+ **Will it accidentally delete my tax returns or bank statements?**
111
+ Not if you tell it not to. Add `@yourbank.com` or your accountant's email to `PROTECTED_EMAIL_PATTERNS` in `config.py`. Once there, it becomes structurally impossible for the tool to delete them, even if you explicitly select them.
112
+
113
+ **Can I undo a deletion?**
114
+ Yes. The tool moves emails to the Gmail Trash folder; it doesn't permanently annihilate them. You have 30 days to rescue them through the standard Gmail UI before Google purges them forever.
115
+
116
+ **What if I really need that shoe store newsletter from 2019?**
117
+ You don't. Insist anyway and you can use the `k` command (e.g., `k10`) to keep the latest 10 and delete the rest. But it will judge you silently.
118
+
119
+ ## License
120
+
121
+ [MIT](LICENSE). The shortest license that works.
@@ -0,0 +1,18 @@
1
+ gmail_cleaner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ gmail_cleaner/analyze.py,sha256=VCH8D3SWYkYt2OdgToej21VTFwKDpyMdfUZuZpgKa0k,4152
3
+ gmail_cleaner/auth.py,sha256=c7ySl-h6yIB-ynCgrc25UYtm3JxHSc5-LS88NNbrBCQ,1845
4
+ gmail_cleaner/config.py,sha256=Zaua7CRRiP6lEHL86Ylh3Ayvu2ZOsBDPJOsnRcS1jAQ,790
5
+ gmail_cleaner/db.py,sha256=ybjTN2XsdjppqDPVRJR2BFC3R4Aq8vjBJ03ASJrGcqc,1482
6
+ gmail_cleaner/delete.py,sha256=6y-0DeX4Tp6fCLn8i6blMw_pJX0nn0xRJeB4qlsi9wk,11921
7
+ gmail_cleaner/main.py,sha256=3K3rYGtLzNUt1EavI13K6c-5sTHcuvCiWqtinttmuDw,1763
8
+ gmail_cleaner/sync.py,sha256=p8nv_Mh1giV5mMCF3zvl1zW76rBBOBm_WPDz_6NIvvI,5294
9
+ gmail_cleaner-0.1.0.dist-info/licenses/LICENSE,sha256=XKKSDU9WlUEAyPNlRhq6e2xhVNpJc097JwPZJ1rUnRE,1077
10
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ tests/conftest.py,sha256=DvyhPLgg4opdZqcM7DHO94fM-RurxcHhCpG8SSdO3Es,2816
12
+ tests/test_analyze.py,sha256=WwsbsD0bLTxD9Lm2TprsLpk-8noL_fTZ3q_zDV6R9r0,2149
13
+ tests/test_delete.py,sha256=anqQG5GefrNpMmMa50aT3spXMi0qPqdILbflm6VdBN4,3195
14
+ gmail_cleaner-0.1.0.dist-info/METADATA,sha256=yJ9F4DcCH8eXXU_HHfJrCYL4HveSjxWuy1SiffIjYAg,5654
15
+ gmail_cleaner-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
16
+ gmail_cleaner-0.1.0.dist-info/entry_points.txt,sha256=QWII3zikd5I12tcunQy6ZZfxVzJh2-CpGQaiw9BZdec,58
17
+ gmail_cleaner-0.1.0.dist-info/top_level.txt,sha256=lWdsgkubIg3PgZ1xyYyXpkzWWOSYf0z8w-5ymSC-SNQ,20
18
+ gmail_cleaner-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ gmail-cleaner = gmail_cleaner.main:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ gmail_cleaner
2
+ tests
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,50 @@
1
+ import pytest
2
+ import sqlite3
3
+ import pandas as pd
4
+ from unittest.mock import patch
5
+
6
+ # Set up synthetic configuration for tests
7
@pytest.fixture(autouse=True)
def mock_config(monkeypatch):
    """Install throwaway settings for every test.

    The database path seen by ``analyze`` and ``delete`` becomes ``:memory:``,
    and small, known auto-delete / protected pattern lists are installed.
    The pattern lists are patched on both ``config`` and ``analyze`` because
    ``analyze`` imports those names directly from ``config``.
    """
    overrides = (
        ("gmail_cleaner.analyze.DB_FILE", ":memory:"),
        ("gmail_cleaner.delete.DB_FILE", ":memory:"),
        ("gmail_cleaner.config.AUTO_DELETE_EMAIL_PATTERNS", ["spam@trash.com", "@junk.com"]),
        ("gmail_cleaner.analyze.AUTO_DELETE_EMAIL_PATTERNS", ["spam@trash.com", "@junk.com"]),
        ("gmail_cleaner.config.PROTECTED_EMAIL_PATTERNS", ["vip@bank.com", "@mywork.com"]),
        ("gmail_cleaner.analyze.PROTECTED_EMAIL_PATTERNS", ["vip@bank.com", "@mywork.com"]),
    )
    for dotted_target, replacement in overrides:
        monkeypatch.setattr(dotted_target, replacement)
16
+
17
@pytest.fixture
def synthetic_df():
    """Build a seven-message DataFrame covering each category the tests need.

    Besides the raw message columns, the frame carries the derived ``email``
    and ``domain`` columns computed from ``sender``.
    """
    from gmail_cleaner.analyze import canonical_email, extract_domain

    columns = ("id", "sender", "subject", "snippet", "body_text", "internal_date")
    rows = [
        # Normal email
        ("msg1", "friend@gmail.com", "Lunch?", "Hey, lunch tomorrow?", "Let me know", 1000),
        # Newsletter
        ("msg2", "daily@news.com", "Your daily newsletter", "Read more...", "Click here to unsubscribe.", 1001),
        # Promotion
        ("msg3", "marketing@store.com", "50% Discount on Shoes!", "Limited time offer", "Big sale today only", 1002),
        # Protected (should normally be filtered by load_data)
        ("msg4", "alerts@mywork.com", "Server Down", "Urgent", "", 1003),
        # Auto Delete
        ("msg5", "spam@trash.com", "Win a prize", "You won", "Click link", 1004),
        # Another from top sender
        ("msg6", "marketing@store.com", "Another coupon", "Save more", "Sale sale sale", 1005),
        # Another newsletter
        ("msg7", "daily@news.com", "Weekly digest", "More news...", "Unsubscribe below.", 1006),
    ]
    frame = pd.DataFrame([dict(zip(columns, row)) for row in rows])
    frame["email"] = frame["sender"].apply(canonical_email)
    frame["domain"] = frame["sender"].apply(extract_domain)
    return frame
40
+
41
@pytest.fixture
def mock_db(synthetic_df):
    """Yield an in-memory SQLite database pre-loaded with ``synthetic_df``.

    While the fixture is active, ``sqlite3.connect`` is patched to return this
    connection, so code that opens the database sees the synthetic
    ``gmail_messages`` table. The connection is closed on teardown, including
    when populating the table fails, so it is never leaked.
    """
    conn = sqlite3.connect(":memory:")
    try:
        synthetic_df.to_sql("gmail_messages", conn, index=False)

        # Add 'deleted' column as expected by load_data
        conn.execute("ALTER TABLE gmail_messages ADD COLUMN deleted INTEGER NOT NULL DEFAULT 0")

        with patch("sqlite3.connect", return_value=conn):
            yield conn
    finally:
        # Closing an already-closed sqlite3 connection is a no-op, so this is
        # safe even if the code under test closed it first.
        conn.close()
tests/test_analyze.py ADDED
@@ -0,0 +1,61 @@
1
+ from gmail_cleaner.analyze import (
2
+ extract_email,
3
+ extract_domain,
4
+ canonical_email,
5
+ matches_pattern,
6
+ matches_protected,
7
+ matches_auto_delete,
8
+ load_data
9
+ )
10
+
11
+
12
def test_extract_email():
    """extract_email returns the bare address, or "" when the sender is None."""
    expectations = [
        ("John Doe <john@example.com>", "john@example.com"),
        ("john@example.com", "john@example.com"),
        (None, ""),
    ]
    for raw_sender, address in expectations:
        assert extract_email(raw_sender) == address
16
+
17
+
18
def test_extract_domain():
    """extract_domain returns the part after "@", or "(unknown)" without one."""
    expectations = [
        ("John Doe <john@example.com>", "example.com"),
        ("no-reply@sub.domain.com", "sub.domain.com"),
        ("invalid-email", "(unknown)"),
    ]
    for raw_sender, domain in expectations:
        assert extract_domain(raw_sender) == domain
22
+
23
+
24
def test_canonical_email():
    """canonical_email strips the display name and lower-cases the address."""
    for raw_sender in ("John Doe <john@example.com>", "John Doe <John@Example.COM>"):
        assert canonical_email(raw_sender) == "john@example.com"
27
+
28
+
29
def test_matches_pattern():
    """matches_pattern accepts "@domain" patterns and exact addresses only."""
    patterns = ["@spam.com", "exact@match.com"]
    verdicts = [
        ("user@spam.com", True),
        ("exact@match.com", True),
        ("user@notspam.com", False),
    ]
    for address, expected in verdicts:
        assert matches_pattern(address, patterns) is expected
34
+
35
+
36
def test_matches_protected(mock_config):
    """matches_protected honours the patched protected patterns."""
    # Patterns installed by the conftest mock: ["vip@bank.com", "@mywork.com"]
    verdicts = [
        ("user@mywork.com", True),
        ("vip@bank.com", True),
        ("other@bank.com", False),
    ]
    for address, expected in verdicts:
        assert matches_protected(address) is expected
41
+
42
+
43
def test_matches_auto_delete(mock_config):
    """matches_auto_delete honours the patched auto-delete patterns."""
    # Patterns installed by the conftest mock: ["spam@trash.com", "@junk.com"]
    verdicts = [
        ("spam@trash.com", True),
        ("user@junk.com", True),
        ("user@good.com", False),
    ]
    for address, expected in verdicts:
        assert matches_auto_delete(address) is expected
48
+
49
+
50
def test_load_data(mock_db, mock_config):
    """load_data drops protected senders but keeps auto-delete senders."""
    loaded = load_data()
    remaining = loaded["email"].tolist()

    # "alerts@mywork.com" matches PROTECTED_EMAIL_PATTERNS, so it must be gone.
    assert "alerts@mywork.com" not in remaining

    # "spam@trash.com" stays: auto-delete strategies pick it up later.
    assert "spam@trash.com" in remaining

    # 7 synthetic rows minus the 1 protected row.
    assert len(loaded) == 6
tests/test_delete.py ADDED
@@ -0,0 +1,92 @@
1
+ from gmail_cleaner.delete import (
2
+ parse_selection,
3
+ get_newsletter_candidates,
4
+ get_promotion_candidates,
5
+ get_top_sender_candidates,
6
+ get_auto_delete_candidates,
7
+ build_pattern_deletes
8
+ )
9
+
10
+
11
def test_parse_selection_single_values():
    """Comma-separated 1-based numbers map to a set of 0-based indices."""
    expectations = [
        ("1", {0}),
        ("1, 3, 5", {0, 2, 4}),
    ]
    for selection, indices in expectations:
        assert parse_selection(selection, 10) == indices
14
+
15
+
16
def test_parse_selection_ranges():
    """Inclusive "a-b" ranges expand to every 0-based index they cover."""
    expectations = [
        ("1-5", {0, 1, 2, 3, 4}),
        ("1-3, 5-6", {0, 1, 2, 4, 5}),
    ]
    for selection, indices in expectations:
        assert parse_selection(selection, 10) == indices
19
+
20
+
21
def test_parse_selection_out_of_bounds():
    """Out-of-range and malformed entries are dropped rather than raising."""
    limit = 10

    # 15 exceeds the maximum of 10, so only the valid entry survives.
    assert parse_selection("1, 15", limit) == {0}

    # A malformed entry such as "-5" yields an empty selection.
    assert parse_selection("-5", limit) == set()
26
+
27
+
28
def test_parse_selection_mixed():
    """Single numbers and ranges can be combined in one selection string."""
    chosen = parse_selection("1, 3-5, 9", 10)
    assert chosen == {0, 2, 3, 4, 8}
30
+
31
+
32
def test_get_newsletter_candidates(synthetic_df):
    """Only daily@news.com, with its 2 newsletter emails, is a candidate."""
    found = list(get_newsletter_candidates(synthetic_df))
    assert len(found) == 1

    top = found[0]
    assert top[0] == "daily@news.com"
    assert top[1] == 2
38
+
39
+
40
def test_get_promotion_candidates(synthetic_df):
    """Only marketing@store.com, with its 2 promotions, is a candidate."""
    found = list(get_promotion_candidates(synthetic_df))
    assert len(found) == 1

    top = found[0]
    assert top[0] == "marketing@store.com"
    assert top[1] == 2
46
+
47
+
48
def test_get_top_sender_candidates(synthetic_df):
    """Top-sender candidates include both two-message senders and friend@gmail.com."""
    # marketing and daily both have 2 messages in the synthetic data.
    listed = [entry[0] for entry in get_top_sender_candidates(synthetic_df)]
    for sender in ("marketing@store.com", "daily@news.com", "friend@gmail.com"):
        assert sender in listed
55
+
56
+
57
def test_get_auto_delete_candidates(synthetic_df, mock_config):
    """Only spam@trash.com matches the patched auto-delete patterns."""
    found = list(get_auto_delete_candidates(synthetic_df))
    assert len(found) == 1

    top = found[0]
    assert top[0] == "spam@trash.com"
61
+
62
+
63
def test_build_pattern_deletes(synthetic_df):
    """build_pattern_deletes turns a selection string into message ids."""
    # marketing@store.com sent 2 emails:
    # "50% Discount on Shoes!" (msg3) and "Another coupon" (msg6).
    store_mail = synthetic_df[synthetic_df["email"] == "marketing@store.com"]

    # Fake candidate list shaped like the one handle_sender creates:
    # [(normalized_subject, count)]
    patterns = [
        ("another coupon", 1),
        ("<num>% discount on shoes!", 1),
    ]

    # "1" selects only the first pattern ("another coupon").
    picked = build_pattern_deletes(store_mail, patterns, "1")
    assert len(picked) == 1
    assert picked[0] == "msg6"

    # "1, 2" selects both patterns.
    picked = build_pattern_deletes(store_mail, patterns, "1, 2")
    assert len(picked) == 2
    assert "msg3" in picked
    assert "msg6" in picked

    # "k1" keeps the newest message (msg6, internal_date 1005) and
    # deletes the older one (msg3, internal_date 1002).
    picked = build_pattern_deletes(store_mail, patterns, "k1")
    assert len(picked) == 1
    assert picked[0] == "msg3"

    # "d" does not mean "delete all": in the current logic it deletes nothing
    # (`if selection == "d": return []`), so no ids come back.
    picked = build_pattern_deletes(store_mail, patterns, "d")
    assert len(picked) == 0