gmail-cleaner 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gmail_cleaner-0.1.0/LICENSE +21 -0
- gmail_cleaner-0.1.0/PKG-INFO +121 -0
- gmail_cleaner-0.1.0/README.md +105 -0
- gmail_cleaner-0.1.0/gmail_cleaner/__init__.py +0 -0
- gmail_cleaner-0.1.0/gmail_cleaner/analyze.py +131 -0
- gmail_cleaner-0.1.0/gmail_cleaner/auth.py +57 -0
- gmail_cleaner-0.1.0/gmail_cleaner/config.py +28 -0
- gmail_cleaner-0.1.0/gmail_cleaner/db.py +68 -0
- gmail_cleaner-0.1.0/gmail_cleaner/delete.py +341 -0
- gmail_cleaner-0.1.0/gmail_cleaner/main.py +60 -0
- gmail_cleaner-0.1.0/gmail_cleaner/sync.py +160 -0
- gmail_cleaner-0.1.0/gmail_cleaner.egg-info/PKG-INFO +121 -0
- gmail_cleaner-0.1.0/gmail_cleaner.egg-info/SOURCES.txt +21 -0
- gmail_cleaner-0.1.0/gmail_cleaner.egg-info/dependency_links.txt +1 -0
- gmail_cleaner-0.1.0/gmail_cleaner.egg-info/entry_points.txt +2 -0
- gmail_cleaner-0.1.0/gmail_cleaner.egg-info/requires.txt +6 -0
- gmail_cleaner-0.1.0/gmail_cleaner.egg-info/top_level.txt +4 -0
- gmail_cleaner-0.1.0/pyproject.toml +34 -0
- gmail_cleaner-0.1.0/setup.cfg +4 -0
- gmail_cleaner-0.1.0/tests/__init__.py +0 -0
- gmail_cleaner-0.1.0/tests/conftest.py +50 -0
- gmail_cleaner-0.1.0/tests/test_analyze.py +61 -0
- gmail_cleaner-0.1.0/tests/test_delete.py +92 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mayank Gupta
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gmail_cleaner
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A powerful local CLI tool to sync, cluster, and surgically clean your Gmail.
|
|
5
|
+
Author-email: Mayank Gupta <mayankgupta690@gmail.com>
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: google-api-python-client
|
|
10
|
+
Requires-Dist: google-auth-httplib2
|
|
11
|
+
Requires-Dist: google-auth-oauthlib
|
|
12
|
+
Requires-Dist: pandas
|
|
13
|
+
Requires-Dist: scikit-learn
|
|
14
|
+
Requires-Dist: tqdm
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
<p align="center">
|
|
18
|
+
<img src="assets/logo.png" width="220" alt="Gmail Cleaner Logo">
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
<h1 align="center">Gmail Cleaner</h1>
|
|
22
|
+
|
|
23
|
+
<p align="center">
|
|
24
|
+
<em>Because somewhere in those 50,000 emails is a tax document you actually need.</em>
|
|
25
|
+
</p>
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<a href="https://pypi.org/project/gmail-cleaner/"><img src="https://img.shields.io/pypi/v/gmail-cleaner.svg?color=blue" alt="PyPI version"></a>
|
|
29
|
+
<a href="https://codecov.io/gh/immkg/gmail-cleaner"><img src="https://codecov.io/gh/immkg/gmail-cleaner/branch/main/graph/badge.svg" alt="Coverage"></a>
|
|
30
|
+
<a href="https://github.com/immkg/gmail-cleaner/releases/latest"><img src="https://img.shields.io/github/v/release/immkg/gmail-cleaner?style=flat-square&color=brightgreen&label=release" alt="Release"></a>
|
|
31
|
+
<img src="https://img.shields.io/badge/license-MIT-111111" alt="MIT license">
|
|
32
|
+
</p>
|
|
33
|
+
|
|
34
|
+
<p align="center">
|
|
35
|
+
<strong>Built by Mayank Gupta</strong><br>
|
|
36
|
+
<sub>Surgical precision for your inbox. ~99% less clutter · ~100% more sanity · 100% local.</sub>
|
|
37
|
+
</p>
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
You have 50,000 emails. Somewhere in that mountain of newsletters, random promotions, and auto-generated alerts is a critical message from your bank that you cannot afford to lose. You can't just 'Select All -> Delete'.
|
|
42
|
+
|
|
43
|
+
You need a surgical tool.
|
|
44
|
+
|
|
45
|
+
With Gmail Cleaner:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# It just deletes them.
|
|
49
|
+
python -m gmail_cleaner.main clean
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Gmail Cleaner synchronizes your Gmail account into a local SQLite database, analyzes your emails using Pandas and Scikit-Learn, and interactively bulk-deletes the noise based on aggressive, local strategies.
|
|
53
|
+
|
|
54
|
+
## Before / after
|
|
55
|
+
|
|
56
|
+
You try to find an important bank email. Your search results are flooded with 400 "Limited Time Offer!" emails from a newsletter you never subscribed to.
|
|
57
|
+
|
|
58
|
+
After Gmail Cleaner: You actually see your bank email.
|
|
59
|
+
|
|
60
|
+
## Setup
|
|
61
|
+
|
|
62
|
+
The most effort Gmail Cleaner will ever ask of you:
|
|
63
|
+
|
|
64
|
+
### 1. Enable the Gmail API
|
|
65
|
+
1. Go to the [Google Cloud Console](https://console.cloud.google.com/).
|
|
66
|
+
2. Create a new project or select an existing one.
|
|
67
|
+
3. Navigate to **APIs & Services > Library**.
|
|
68
|
+
4. Search for "Gmail API" and click **Enable**.
|
|
69
|
+
|
|
70
|
+
### 2. Set Up Desktop App Credentials
|
|
71
|
+
1. Go to **APIs & Services > OAuth consent screen**. Choose **External** and add your own Gmail address under **Test users**.
|
|
72
|
+
2. Go to **APIs & Services > Credentials**.
|
|
73
|
+
3. Click **Create Credentials** > **OAuth client ID**.
|
|
74
|
+
4. Choose **Desktop app**.
|
|
75
|
+
5. Click **Download JSON** on the confirmation dialog.
|
|
76
|
+
6. Rename the file to `credentials.json` and place it in the directory you run Gmail Cleaner from (the tool looks for `credentials.json` in the current working directory).
|
|
77
|
+
|
|
78
|
+
### 3. Install
|
|
79
|
+
You can install directly from PyPI or download the standalone `.exe` from the latest GitHub Release.
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install gmail-cleaner
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Configuration
|
|
86
|
+
|
|
87
|
+
Before running the clean command, open `gmail_cleaner/config.py` and customize your patterns:
|
|
88
|
+
|
|
89
|
+
- `AUTO_DELETE_EMAIL_PATTERNS`: Add email addresses or domains that you always want to delete instantly (e.g. `newsletter@spam.com`).
|
|
90
|
+
- `PROTECTED_EMAIL_PATTERNS`: Add personal or banking emails that should NEVER be deleted by the tool (e.g. `@yourbank.com`).
|
|
91
|
+
|
|
92
|
+
*Lazy, not negligent: The code ensures these patterns are scrubbed from version control, so you can safely keep this repo synced without leaking your personal contacts.*
|
|
93
|
+
|
|
94
|
+
## Commands
|
|
95
|
+
|
|
96
|
+
| Command | What it does |
|
|
97
|
+
|---------|--------------|
|
|
98
|
+
| `python -m gmail_cleaner.main sync` | Syncs your emails to a local SQLite database. |
|
|
99
|
+
| `python -m gmail_cleaner.main analyze` | Analyzes emails to show top domains, senders, and TF-IDF topic clusters. |
|
|
100
|
+
| `python -m gmail_cleaner.main clean` | Starts the interactive CLI to bulk delete emails based on strategies (newsletters, promotions, etc.). |
|
|
101
|
+
| `python -m gmail_cleaner.main clean --dry-run` | Simulates the deletion queue and prints what *would* be deleted without touching the Gmail API. |
|
|
102
|
+
|
|
103
|
+
*When cleaning, you can select specific email ranges (e.g., `1,2,5-10`, `k10` to keep the latest 10) and accumulate deletions before executing them in bulk by typing `yes` to skip or `now` to process the queue.*
|
|
104
|
+
|
|
105
|
+
## FAQ
|
|
106
|
+
|
|
107
|
+
**Does it upload my emails to a random server?**
|
|
108
|
+
No. It downloads them to a local SQLite database (`gmail.db`). The TF-IDF analysis, clustering, and deleting all happen entirely on your machine. 100% local.
|
|
109
|
+
|
|
110
|
+
**Will it accidentally delete my tax returns or bank statements?**
|
|
111
|
+
Not if you tell it not to. Add `@yourbank.com` or your accountant's email to `PROTECTED_EMAIL_PATTERNS` in `config.py`. Once there, it becomes structurally impossible for the tool to delete them, even if you explicitly select them.
|
|
112
|
+
|
|
113
|
+
**Can I undo a deletion?**
|
|
114
|
+
Yes. The tool moves emails to the Gmail Trash folder; it doesn't permanently annihilate them. You have 30 days to rescue them through the standard Gmail UI before Google purges them forever.
|
|
115
|
+
|
|
116
|
+
**What if I really need that shoe store newsletter from 2019?**
|
|
117
|
+
You don't. Insist anyway and you can use the `k` command (e.g., `k10`) to keep the latest 10 and delete the rest. But it will judge you silently.
|
|
118
|
+
|
|
119
|
+
## License
|
|
120
|
+
|
|
121
|
+
[MIT](LICENSE). The shortest license that works.
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/logo.png" width="220" alt="Gmail Cleaner Logo">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">Gmail Cleaner</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<em>Because somewhere in those 50,000 emails is a tax document you actually need.</em>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<a href="https://pypi.org/project/gmail-cleaner/"><img src="https://img.shields.io/pypi/v/gmail-cleaner.svg?color=blue" alt="PyPI version"></a>
|
|
13
|
+
<a href="https://codecov.io/gh/immkg/gmail-cleaner"><img src="https://codecov.io/gh/immkg/gmail-cleaner/branch/main/graph/badge.svg" alt="Coverage"></a>
|
|
14
|
+
<a href="https://github.com/immkg/gmail-cleaner/releases/latest"><img src="https://img.shields.io/github/v/release/immkg/gmail-cleaner?style=flat-square&color=brightgreen&label=release" alt="Release"></a>
|
|
15
|
+
<img src="https://img.shields.io/badge/license-MIT-111111" alt="MIT license">
|
|
16
|
+
</p>
|
|
17
|
+
|
|
18
|
+
<p align="center">
|
|
19
|
+
<strong>Built by Mayank Gupta</strong><br>
|
|
20
|
+
<sub>Surgical precision for your inbox. ~99% less clutter · ~100% more sanity · 100% local.</sub>
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
You have 50,000 emails. Somewhere in that mountain of newsletters, random promotions, and auto-generated alerts is a critical message from your bank that you cannot afford to lose. You can't just 'Select All -> Delete'.
|
|
26
|
+
|
|
27
|
+
You need a surgical tool.
|
|
28
|
+
|
|
29
|
+
With Gmail Cleaner:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
# It just deletes them.
|
|
33
|
+
python -m gmail_cleaner.main clean
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Gmail Cleaner synchronizes your Gmail account into a local SQLite database, analyzes your emails using Pandas and Scikit-Learn, and interactively bulk-deletes the noise based on aggressive, local strategies.
|
|
37
|
+
|
|
38
|
+
## Before / after
|
|
39
|
+
|
|
40
|
+
You try to find an important bank email. Your search results are flooded with 400 "Limited Time Offer!" emails from a newsletter you never subscribed to.
|
|
41
|
+
|
|
42
|
+
After Gmail Cleaner: You actually see your bank email.
|
|
43
|
+
|
|
44
|
+
## Setup
|
|
45
|
+
|
|
46
|
+
The most effort Gmail Cleaner will ever ask of you:
|
|
47
|
+
|
|
48
|
+
### 1. Enable the Gmail API
|
|
49
|
+
1. Go to the [Google Cloud Console](https://console.cloud.google.com/).
|
|
50
|
+
2. Create a new project or select an existing one.
|
|
51
|
+
3. Navigate to **APIs & Services > Library**.
|
|
52
|
+
4. Search for "Gmail API" and click **Enable**.
|
|
53
|
+
|
|
54
|
+
### 2. Set Up Desktop App Credentials
|
|
55
|
+
1. Go to **APIs & Services > OAuth consent screen**. Choose **External** and add your own Gmail address under **Test users**.
|
|
56
|
+
2. Go to **APIs & Services > Credentials**.
|
|
57
|
+
3. Click **Create Credentials** > **OAuth client ID**.
|
|
58
|
+
4. Choose **Desktop app**.
|
|
59
|
+
5. Click **Download JSON** on the confirmation dialog.
|
|
60
|
+
6. Rename the file to `credentials.json` and place it in the directory you run Gmail Cleaner from (the tool looks for `credentials.json` in the current working directory).
|
|
61
|
+
|
|
62
|
+
### 3. Install
|
|
63
|
+
You can install directly from PyPI or download the standalone `.exe` from the latest GitHub Release.
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install gmail-cleaner
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Configuration
|
|
70
|
+
|
|
71
|
+
Before running the clean command, open `gmail_cleaner/config.py` and customize your patterns:
|
|
72
|
+
|
|
73
|
+
- `AUTO_DELETE_EMAIL_PATTERNS`: Add email addresses or domains that you always want to delete instantly (e.g. `newsletter@spam.com`).
|
|
74
|
+
- `PROTECTED_EMAIL_PATTERNS`: Add personal or banking emails that should NEVER be deleted by the tool (e.g. `@yourbank.com`).
|
|
75
|
+
|
|
76
|
+
*Lazy, not negligent: The code ensures these patterns are scrubbed from version control, so you can safely keep this repo synced without leaking your personal contacts.*
|
|
77
|
+
|
|
78
|
+
## Commands
|
|
79
|
+
|
|
80
|
+
| Command | What it does |
|
|
81
|
+
|---------|--------------|
|
|
82
|
+
| `python -m gmail_cleaner.main sync` | Syncs your emails to a local SQLite database. |
|
|
83
|
+
| `python -m gmail_cleaner.main analyze` | Analyzes emails to show top domains, senders, and TF-IDF topic clusters. |
|
|
84
|
+
| `python -m gmail_cleaner.main clean` | Starts the interactive CLI to bulk delete emails based on strategies (newsletters, promotions, etc.). |
|
|
85
|
+
| `python -m gmail_cleaner.main clean --dry-run` | Simulates the deletion queue and prints what *would* be deleted without touching the Gmail API. |
|
|
86
|
+
|
|
87
|
+
*When cleaning, you can select specific email ranges (e.g., `1,2,5-10`, `k10` to keep the latest 10) and accumulate deletions before executing them in bulk by typing `yes` to skip or `now` to process the queue.*
|
|
88
|
+
|
|
89
|
+
## FAQ
|
|
90
|
+
|
|
91
|
+
**Does it upload my emails to a random server?**
|
|
92
|
+
No. It downloads them to a local SQLite database (`gmail.db`). The TF-IDF analysis, clustering, and deleting all happen entirely on your machine. 100% local.
|
|
93
|
+
|
|
94
|
+
**Will it accidentally delete my tax returns or bank statements?**
|
|
95
|
+
Not if you tell it not to. Add `@yourbank.com` or your accountant's email to `PROTECTED_EMAIL_PATTERNS` in `config.py`. Once there, it becomes structurally impossible for the tool to delete them, even if you explicitly select them.
|
|
96
|
+
|
|
97
|
+
**Can I undo a deletion?**
|
|
98
|
+
Yes. The tool moves emails to the Gmail Trash folder; it doesn't permanently annihilate them. You have 30 days to rescue them through the standard Gmail UI before Google purges them forever.
|
|
99
|
+
|
|
100
|
+
**What if I really need that shoe store newsletter from 2019?**
|
|
101
|
+
You don't. Insist anyway and you can use the `k` command (e.g., `k10`) to keep the latest 10 and delete the rest. But it will judge you silently.
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
[MIT](LICENSE). The shortest license that works.
|
|
File without changes
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
import re
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from email.utils import parseaddr
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from sklearn.cluster import MiniBatchKMeans
|
|
8
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
9
|
+
from sklearn.feature_extraction import text as sklearn_text
|
|
10
|
+
|
|
11
|
+
from gmail_cleaner.config import DB_FILE, AUTO_DELETE_EMAIL_PATTERNS, PROTECTED_EMAIL_PATTERNS
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def matches_pattern(email, patterns):
    """Return True if *email* matches any entry in *patterns*.

    Matching is case-insensitive and ignores surrounding whitespace.

    * A pattern starting with "@" is a domain pattern: it matches any
      address whose text ends with "@<domain>" or ".<domain>" (i.e. the
      domain itself or one of its subdomains).
    * Any other pattern must equal the whole address.

    Blank patterns, ``None`` entries and a bare "@" never match, so an
    accidental empty string in the configuration cannot match senders that
    have no parseable address.
    """
    email = (email or "").strip().lower()
    if not email:
        return False

    for pattern in patterns:
        pattern = (pattern or "").strip().lower()
        if pattern.startswith("@"):
            domain = pattern[1:]
            # str.endswith accepts a tuple: exact domain or any subdomain.
            if domain and email.endswith(("@" + domain, "." + domain)):
                return True
        elif pattern and email == pattern:
            return True
    return False
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def extract_email(sender):
    """Return the lower-cased address part of a ``From``-style header value.

    ``None`` and values from which no address can be parsed yield "".
    """
    return parseaddr(sender or "")[1].lower()


def extract_domain(sender):
    """Return the lower-cased domain of *sender*, or "(unknown)".

    The domain is the text after the *last* "@" of the parsed address, so a
    quoted local part that itself contains "@" does not leak into the
    result. "(unknown)" is returned when the address contains no "@".
    """
    _local, at_sign, domain = extract_email(sender).rpartition("@")
    if not at_sign:
        return "(unknown)"
    return domain


def canonical_email(sender):
    """Return the normalized (lower-cased) address of *sender*, or ""."""
    # extract_email already lower-cases and returns "" when nothing parses.
    return extract_email(sender)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def matches_auto_delete(email):
    """Report whether *email* matches one of AUTO_DELETE_EMAIL_PATTERNS."""
    auto_delete_patterns = AUTO_DELETE_EMAIL_PATTERNS
    return matches_pattern(email, auto_delete_patterns)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def matches_protected(email):
    """Report whether *email* matches one of PROTECTED_EMAIL_PATTERNS."""
    protected_patterns = PROTECTED_EMAIL_PATTERNS
    return matches_pattern(email, protected_patterns)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def load_data():
    """Load the analyzable messages from the local database.

    Reads ``gmail_messages`` from DB_FILE, skipping rows whose ``deleted``
    flag is set, then drops every message whose sender matches a protected
    pattern.

    Returns:
        A pandas DataFrame with the selected columns plus two derived ones:
        ``email`` (normalized sender address) and ``domain`` (sender domain
        or "(unknown)").
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        df = pd.read_sql(
            """
            SELECT id, sender, subject, internal_date, snippet, body_text, deleted
            FROM gmail_messages
            WHERE COALESCE(deleted, 0) = 0
            """,
            conn,
        )
    finally:
        # Release the connection even if the query fails.
        conn.close()

    senders = df["sender"].fillna("")
    df["email"] = senders.apply(canonical_email)

    # astype(bool) keeps the mask a real boolean Series even when the frame
    # is empty; .copy() makes the filtered frame independent so the column
    # assignment below does not write into a slice of the original.
    protected = df["email"].apply(matches_protected).astype(bool)
    df = df[~protected].copy()
    df["domain"] = df["sender"].fillna("").apply(extract_domain)

    return df
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def run_analysis():
    """Print a plain-text report about the locally synced mailbox.

    The report is written to stdout and contains, in order: the number of
    analyzable emails, the 50 most common sender domains, the 100 most
    common sender addresses, the 50 most common subject words (ASCII words
    of four or more letters), and TF-IDF / k-means topic clusters built from
    subject + snippet.

    Topic clustering is skipped with a short notice when the vectorizer
    cannot build a vocabulary (for example a mailbox with fewer than five
    messages, or no term that occurs in at least five of them) instead of
    aborting the whole report.
    """
    df = load_data()

    print("\n" + "=" * 100)
    print(f"EMAILS: {len(df):,}")
    print("=" * 100)

    # --------------------------------------------------
    # TOP DOMAINS
    # --------------------------------------------------
    print("\nTOP DOMAINS")
    print("-" * 100)
    for domain, count in df["domain"].value_counts().head(50).items():
        print(f"{count:8,d} {domain}")

    # --------------------------------------------------
    # TOP SENDERS
    # --------------------------------------------------
    print("\nTOP SENDERS")
    print("-" * 100)
    for sender, count in df["email"].value_counts().head(100).items():
        print(f"{count:8,d} {sender}")

    # --------------------------------------------------
    # SUBJECT WORD ANALYSIS
    # --------------------------------------------------
    subject_words = Counter()
    for subject in df["subject"].fillna(""):
        subject_words.update(re.findall(r"[a-zA-Z]{4,}", subject.lower()))

    print("\nTOP SUBJECT WORDS")
    print("-" * 100)
    for word, count in subject_words.most_common(50):
        print(f"{count:8,d} {word}")

    # --------------------------------------------------
    # TF-IDF TOPICS
    # --------------------------------------------------
    print("\nTOPIC CLUSTERS")
    print("-" * 100)
    documents = df["subject"].fillna("") + " " + df["snippet"].fillna("")
    vectorizer = TfidfVectorizer(
        stop_words="english", max_features=5000, min_df=5)

    X = None
    if len(documents) > 0:
        try:
            X = vectorizer.fit_transform(documents)
        except ValueError:
            # Raised by scikit-learn when min_df cannot be satisfied or the
            # resulting vocabulary is empty; treat it as "nothing to cluster".
            X = None

    if X is None:
        print("Not enough data to build topic clusters.")
    else:
        cluster_count = min(20, max(2, len(df) // 100))
        model = MiniBatchKMeans(n_clusters=cluster_count,
                                random_state=42, batch_size=2048)
        model.fit(X)
        terms = vectorizer.get_feature_names_out()

        for cluster_id in range(cluster_count):
            center = model.cluster_centers_[cluster_id]
            # Indices of the 12 highest-weighted terms, strongest first.
            top_indices = center.argsort()[-12:][::-1]
            keywords = [terms[i] for i in top_indices]
            size = (model.labels_ == cluster_id).sum()
            print(f"\nCluster {cluster_id + 1} ({size:,} emails)")
            print(", ".join(keywords))

    print("\nDone.")
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import random
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from google.oauth2.credentials import Credentials
|
|
6
|
+
from google.auth.transport.requests import Request
|
|
7
|
+
from google_auth_oauthlib.flow import InstalledAppFlow
|
|
8
|
+
from googleapiclient.discovery import build
|
|
9
|
+
from googleapiclient.errors import HttpError
|
|
10
|
+
|
|
11
|
+
from gmail_cleaner.config import SCOPES, TOKEN_FILE, CREDS_FILE, MAX_RETRIES
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_gmail_service():
    """Return an authorized Gmail API client (``gmail`` v1).

    Credentials are loaded from TOKEN_FILE when it exists. If they are
    missing or invalid they are refreshed when possible; when there is
    nothing to refresh, or the refresh is rejected (for example a revoked
    refresh token), the installed-app consent flow is run using CREDS_FILE.
    Newly obtained or refreshed credentials are written back to TOKEN_FILE.

    Returns:
        The client object produced by ``googleapiclient.discovery.build``.
    """
    import os

    from google.auth.exceptions import RefreshError

    creds = None
    if Path(TOKEN_FILE).exists():
        creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)

    if not creds or not creds.valid:
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                refreshed = True
            except RefreshError:
                # The stored refresh token is no longer accepted; fall back
                # to the interactive flow instead of crashing.
                refreshed = False

        if not refreshed:
            flow = InstalledAppFlow.from_client_secrets_file(
                CREDS_FILE, SCOPES)
            creds = flow.run_local_server(port=0)

        # The token grants mailbox access, so create the file as owner-only.
        # The mode only applies when the file is created; an existing file
        # keeps its current permissions.
        fd = os.open(TOKEN_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(creds.to_json())

    return build("gmail", "v1", credentials=creds, cache_discovery=False)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def gmail_call(request):
    """Execute a Gmail API request, retrying transient HTTP failures.

    The request is executed up to MAX_RETRIES times. HTTP 429/500/502/503/504
    responses are retried after an exponential backoff (capped at 60 seconds)
    plus up to one second of random jitter; any other HttpError is re-raised
    immediately.

    Args:
        request: An object exposing ``execute(num_retries=...)``.

    Returns:
        Whatever ``request.execute`` returns.

    Raises:
        HttpError: For a non-retryable HTTP status.
        RuntimeError: When every attempt failed with a retryable status; the
            last HttpError is attached as ``__cause__``.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    last_error = None

    for attempt in range(MAX_RETRIES):
        try:
            return request.execute(num_retries=3)
        except HttpError as e:
            status = getattr(e.resp, "status", None)
            if status not in retryable_statuses:
                raise
            last_error = e
            if attempt + 1 >= MAX_RETRIES:
                # No attempt follows, so waiting would only delay the error.
                break
            wait = min(60, 2 ** attempt) + random.random()
            print(
                f"\nHTTP {status} retry {attempt+1}/{MAX_RETRIES} wait={wait:.1f}s")
            time.sleep(wait)

    raise RuntimeError("Max retries exceeded") from last_error
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def trash_email(gmail_id):
    """Move the message with id *gmail_id* to the Gmail Trash.

    The request goes through gmail_call so it gets the same retry and
    backoff handling as other Gmail API calls in this module.

    Returns:
        True on success, otherwise the error message as a string. Callers
        must therefore compare with ``is True`` rather than rely on
        truthiness, because a non-empty error string is also truthy.
    """
    service = get_gmail_service()
    try:
        request = service.users().messages().trash(userId="me", id=gmail_id)
        gmail_call(request)
        return True
    except Exception as e:
        # Deliberately broad: the contract is to report any failure as text
        # rather than raise.
        return str(e)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# config.py
# Settings shared by the gmail_cleaner modules.

# Local files. These are relative paths, so they are resolved against the
# directory the tool is run from.
DB_FILE = "gmail.db"
CREDS_FILE = "credentials.json"
TOKEN_FILE = "token.json"

# OAuth scopes requested when authorizing against the Gmail API.
SCOPES = ["https://www.googleapis.com/auth/gmail.modify"]

# Tuning knobs.
MAX_WORKERS = 20    # presumably the worker count for sync; usage not shown here
MAX_RETRIES = 8     # attempts made by auth.gmail_call before it gives up
COMMIT_BATCH = 100  # presumably rows per DB commit during sync; verify in sync.py

# --------------------------------------------------
# CONFIGURABLE PATTERNS
# --------------------------------------------------
# Senders to categorize for deletion automatically. Use a full address or a
# domain with a leading "@".
# Example: "newsletter@example.com", "@spamdomain.com"
AUTO_DELETE_EMAIL_PATTERNS = [
    # Add your auto delete patterns here
]

# Senders to protect from accidental deletion. Same pattern syntax as above.
# Example: "personal@gmail.com", "@mybank.com"
PROTECTED_EMAIL_PATTERNS = [
    # Add your protected email patterns here
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
from gmail_cleaner.config import DB_FILE
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def init_db():
    """Open DB_FILE, make sure the schema exists, and return the connection.

    Creates the ``gmail_messages`` and ``sync_state`` tables and the
    ``idx_internal_date`` index when they are missing, and adds the
    ``deleted`` column to ``gmail_messages`` for databases created before
    that column existed. The changes are committed before returning.

    Returns:
        An open ``sqlite3.Connection`` created with
        ``check_same_thread=False``. The caller owns it and must close it.

    Raises:
        sqlite3.Error: If any schema statement fails; the connection is
            closed before the error propagates.
    """
    conn = sqlite3.connect(
        DB_FILE,
        check_same_thread=False
    )

    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS gmail_messages (
                id TEXT PRIMARY KEY,
                thread_id TEXT,
                history_id INTEGER,
                internal_date INTEGER,
                label_ids TEXT,
                subject TEXT,
                sender TEXT,
                recipients TEXT,
                snippet TEXT,
                body_text TEXT,
                raw_json TEXT,
                created_at DATETIME DEFAULT CURRENT_TIMESTAMP
            )
        """)

        conn.execute("""
            CREATE TABLE IF NOT EXISTS sync_state (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        """)

        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_internal_date
            ON gmail_messages(internal_date)
        """)

        # Add the soft-delete flag only when it is missing. Checking the
        # schema first means a real failure (locked database, I/O error) is
        # reported instead of being mistaken for "column already exists".
        # Each PRAGMA table_info row is (cid, name, type, notnull, dflt, pk).
        existing_columns = {
            row[1] for row in conn.execute("PRAGMA table_info(gmail_messages)")
        }
        if "deleted" not in existing_columns:
            conn.execute("""
                ALTER TABLE gmail_messages
                ADD COLUMN deleted INTEGER NOT NULL DEFAULT 0
            """)

        conn.commit()
    except Exception:
        # Do not leak the connection when schema setup fails.
        conn.close()
        raise

    return conn
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_state(conn, key, default=None):
    """Return the value stored under *key* in ``sync_state``.

    Args:
        conn: Open sqlite3 connection whose database has a ``sync_state``
            table.
        key: State key to look up.
        default: Value returned when no row exists for *key*. Defaults to
            ``None``, which matches the behavior of calling without it.

    Returns:
        The stored value, or *default* when the key is absent. A row whose
        value is NULL yields ``None`` rather than *default*.
    """
    row = conn.execute(
        "SELECT value FROM sync_state WHERE key=?",
        (key,)
    ).fetchone()
    return row[0] if row else default
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def set_state(conn, key, value):
    """Store *value* under *key* in ``sync_state``, replacing any old row.

    The value is converted with ``str()`` before it is written. This
    function does not commit; the caller decides when to commit.
    """
    params = (key, str(value))
    conn.execute(
        """
        INSERT OR REPLACE INTO sync_state(key,value)
        VALUES (?,?)
        """,
        params,
    )
|