email-tagger 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- email_tagger/__init__.py +0 -0
- email_tagger/cache.py +97 -0
- email_tagger/checkpoint.py +71 -0
- email_tagger/classifiers/__init__.py +0 -0
- email_tagger/classifiers/contact_classifier.py +144 -0
- email_tagger/cli.py +516 -0
- email_tagger/cost_estimator.py +67 -0
- email_tagger/io/__init__.py +0 -0
- email_tagger/io/artifacts.py +39 -0
- email_tagger/io/readers.py +94 -0
- email_tagger/io/writers.py +71 -0
- email_tagger/metrics.py +112 -0
- email_tagger/models.py +165 -0
- email_tagger/privacy/__init__.py +138 -0
- email_tagger/privacy/payload_builder.py +67 -0
- email_tagger/privacy/policies.py +42 -0
- email_tagger/privacy/redactor.py +88 -0
- email_tagger/providers/__init__.py +0 -0
- email_tagger/providers/base.py +65 -0
- email_tagger/providers/factory.py +80 -0
- email_tagger/providers/local_provider.py +105 -0
- email_tagger/providers/openai_provider.py +126 -0
- email_tagger/types.py +58 -0
- email_tagger-0.2.0.dist-info/METADATA +181 -0
- email_tagger-0.2.0.dist-info/RECORD +29 -0
- email_tagger-0.2.0.dist-info/WHEEL +5 -0
- email_tagger-0.2.0.dist-info/entry_points.txt +2 -0
- email_tagger-0.2.0.dist-info/licenses/LICENSE +21 -0
- email_tagger-0.2.0.dist-info/top_level.txt +1 -0
email_tagger/__init__.py
ADDED
|
File without changes
|
email_tagger/cache.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
email_tagger.cache — Lokalny cache wyników klasyfikacji.
|
|
3
|
+
|
|
4
|
+
Unika wielokrotnego wysyłania tych samych danych do API.
|
|
5
|
+
Klucz cache: hash znormalizowanych danych wejściowych.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import sqlite3
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Optional
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ClassificationCache:
    """
    SQLite-backed cache of classification results.

    Each row maps the SHA-256 hash of a normalized payload to the
    classification result serialized as JSON. The database file defaults
    to ``~/.email-tagger/cache.db``.

    Entries are NOT expired on read: rows stay valid until
    :meth:`czysc_stare` is called (its default cut-off is 30 days).

    The object may be used as a context manager; the connection is closed
    on exit.
    """

    def __init__(self, sciezka: Optional[Path] = None) -> None:
        """Remember the database path and make sure its directory exists.

        Args:
            sciezka: Location of the SQLite file. ``None`` selects
                ``~/.email-tagger/cache.db``. A plain string is accepted
                as well as a ``Path``.
        """
        if sciezka is None:
            sciezka = Path.home() / ".email-tagger" / "cache.db"
        # Coerce so that a str path does not fail on ``.parent`` below.
        self.sciezka = Path(sciezka)
        self.sciezka.parent.mkdir(parents=True, exist_ok=True)
        # Opened lazily by the ``conn`` property.
        self._conn: Optional[sqlite3.Connection] = None

    def __enter__(self) -> "ClassificationCache":
        """Return the cache itself; the connection is still opened lazily."""
        return self

    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        """Close the connection when leaving the ``with`` block."""
        self.zamknij()

    @property
    def conn(self) -> sqlite3.Connection:
        """Return the open connection, creating it and the schema on first use.

        Raises:
            sqlite3.Error: If the database cannot be opened or the schema
                cannot be created.
        """
        if self._conn is None:
            polaczenie = sqlite3.connect(str(self.sciezka))
            try:
                polaczenie.execute(
                    """CREATE TABLE IF NOT EXISTS cache (
                        klucz TEXT PRIMARY KEY,
                        wynik TEXT NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )"""
                )
                polaczenie.execute(
                    "CREATE INDEX IF NOT EXISTS idx_cache_created ON cache(created_at)"
                )
                polaczenie.commit()
            except sqlite3.Error:
                # Do not keep (or leak) a connection whose schema was never
                # created; the next access retries from scratch.
                polaczenie.close()
                raise
            self._conn = polaczenie
        return self._conn

    def klucz(self, payload: dict[str, Any]) -> str:
        """Return the cache key for *payload*.

        The payload is serialized to JSON with sorted keys, so two dicts
        with the same content but different key order share one key.
        """
        raw = json.dumps(payload, sort_keys=True, ensure_ascii=False)
        return hashlib.sha256(raw.encode("utf-8")).hexdigest()

    def pobierz(self, payload: dict[str, Any]) -> Optional[dict[str, Any]]:
        """Look up the cached result for *payload*.

        Returns:
            The decoded result, or ``None`` when there is no entry.
        """
        k = self.klucz(payload)
        row = self.conn.execute(
            "SELECT wynik FROM cache WHERE klucz = ?", (k,)
        ).fetchone()
        if row is None:
            logger.debug("Cache MISS: %s", k[:12])
            return None
        logger.debug("Cache HIT: %s", k[:12])
        return json.loads(row[0])

    def zapisz(self, payload: dict[str, Any], wynik: dict[str, Any]) -> None:
        """Store *wynik* under the key of *payload*, replacing any old entry.

        Replacing a row also resets its ``created_at`` to the current time,
        because the column is filled from its default.
        """
        k = self.klucz(payload)
        self.conn.execute(
            "INSERT OR REPLACE INTO cache (klucz, wynik) VALUES (?, ?)",
            (k, json.dumps(wynik, ensure_ascii=False)),
        )
        self.conn.commit()

    def czysc_stare(self, dni: int = 30) -> int:
        """Delete entries older than *dni* days.

        Returns:
            The number of deleted rows.
        """
        usuniete = self.conn.execute(
            "DELETE FROM cache WHERE created_at < datetime('now', ?)",
            (f"-{dni} days",),
        ).rowcount
        self.conn.commit()
        if usuniete:
            logger.info("Wyczyszczono %d starych wpisów cache", usuniete)
        return usuniete

    def statystyki(self) -> dict:
        """Return the database path, the entry count and the file size in MB."""
        total = self.conn.execute("SELECT COUNT(*) FROM cache").fetchone()[0]
        if self.sciezka.exists():
            rozmiar_mb = round(self.sciezka.stat().st_size / 1_000_000, 2)
        else:
            rozmiar_mb = 0
        return {
            "sciezka": str(self.sciezka),
            "wpisy": total,
            "rozmiar_mb": rozmiar_mb,
        }

    def zamknij(self) -> None:
        """Close the connection if one is open; safe to call repeatedly."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None
|
|
email_tagger/checkpoint.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""
|
|
2
|
+
email_tagger.checkpoint — Przyrostowy checkpoint i resume.
|
|
3
|
+
|
|
4
|
+
Zapisuje postęp po każdym batchu, pozwala wznowić po przerwaniu.
|
|
5
|
+
Krytyczne dla produkcyjnego użycia — bez tego każdy crash = strata całej pracy.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Optional
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Checkpoint:
    """
    Incremental checkpoint for batch processing, stored as a JSON file.

    The file holds the index of the last processed row (``indeks``), the
    total number of rows (``ogolem``), the completion percentage
    (``procent``) and any extra fields the caller passes to
    :meth:`zapisz`.
    """

    def __init__(self, sciezka: Path) -> None:
        """Bind the checkpoint to the file at *sciezka*; nothing is read yet."""
        # Coerce so that a plain string path works as well as a Path.
        self.sciezka = Path(sciezka)
        self._dane: dict[str, Any] = {}

    def wczytaj(self) -> Optional[dict[str, Any]]:
        """Load the checkpoint from disk.

        Returns:
            The stored dict, or ``None`` when the file is missing,
            unreadable, or does not contain a JSON object.
        """
        if not self.sciezka.exists():
            return None
        try:
            with open(self.sciezka, encoding="utf-8") as f:
                dane = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning("Nie można wczytać checkpointu: %s", e)
            return None
        if not isinstance(dane, dict):
            # Valid JSON with the wrong shape (e.g. a list) would otherwise
            # break ``ostatni_indeks`` later on.
            logger.warning(
                "Nie można wczytać checkpointu: %s",
                f"oczekiwano obiektu JSON, otrzymano {type(dane).__name__}",
            )
            return None
        self._dane = dane
        logger.info("Wczytano checkpoint: %s", self.sciezka)
        return self._dane

    def zapisz(self, indeks: int, ogolem: int, **extra) -> None:
        """Persist the current progress.

        Args:
            indeks: Index of the last processed row.
            ogolem: Total number of rows; ``0`` yields a percentage of 0.
            **extra: Additional JSON-serializable fields stored verbatim
                (they override the built-in keys on a name clash).
        """
        self._dane = {
            "indeks": indeks,
            "ogolem": ogolem,
            "procent": round(indeks / ogolem * 100, 1) if ogolem > 0 else 0,
            **extra,
        }
        self.sciezka.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling file and rename it over the target, so that a
        # crash mid-write never leaves a truncated checkpoint behind.
        tymczasowy = self.sciezka.with_name(self.sciezka.name + ".tmp")
        try:
            with open(tymczasowy, "w", encoding="utf-8") as f:
                json.dump(self._dane, f, indent=2, ensure_ascii=False)
            tymczasowy.replace(self.sciezka)
        except Exception:
            # Leave the previous checkpoint untouched and drop the partial file.
            try:
                tymczasowy.unlink()
            except OSError:
                pass
            raise
        logger.info("Checkpoint: %d/%d (%.1f%%)", indeks, ogolem, self._dane["procent"])

    @property
    def ostatni_indeks(self) -> int:
        """Index of the last processed row, or ``-1`` when nothing is loaded."""
        return self._dane.get("indeks", -1)

    @property
    def czy_istnieje(self) -> bool:
        """Whether the checkpoint file currently exists on disk."""
        return self.sciezka.exists()

    def usun(self) -> None:
        """Delete the checkpoint file (after a completed run); no-op if absent."""
        try:
            self.sciezka.unlink()
        except FileNotFoundError:
            return
        logger.info("Usunięto checkpoint: %s", self.sciezka)
|
|
email_tagger/classifiers/__init__.py
ADDED
|
File without changes
|
|
email_tagger/classifiers/contact_classifier.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""
|
|
2
|
+
email_tagger.classifiers.contact_classifier — Główna logika klasyfikacji.
|
|
3
|
+
|
|
4
|
+
Łączy: dane wejściowe -> redakcja PII -> provider AI -> walidacja -> wynik.
|
|
5
|
+
To jedyna klasa która "wie" o całym przepływie.
|
|
6
|
+
|
|
7
|
+
Każdy kontakt jest przetwarzany TYLKO z dozwolonymi polami,
|
|
8
|
+
zgodnie z profilem prywatności.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import time
|
|
15
|
+
from typing import Any, Optional
|
|
16
|
+
|
|
17
|
+
from email_tagger.cache import ClassificationCache
|
|
18
|
+
from email_tagger.models import ClassifiedContact, TagResult
|
|
19
|
+
from email_tagger.privacy.payload_builder import (
|
|
20
|
+
sformatuj_payload_dla_prompta,
|
|
21
|
+
zbuduj_payload,
|
|
22
|
+
)
|
|
23
|
+
from email_tagger.providers.base import BaseProvider
|
|
24
|
+
from email_tagger.types import PrivacyProfile
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ContactClassifier:
    """
    Contact classifier.

    For every contact it:
    1. builds a payload with ``zbuduj_payload`` for the configured
       privacy profile,
    2. looks the payload up in the cache (when a cache is configured),
    3. otherwise sends the formatted payload to the AI provider,
    4. stores the provider's tags in the cache,
    5. returns a ``ClassifiedContact``.
    """

    def __init__(
        self,
        provider: BaseProvider,
        profil: PrivacyProfile = "cloud-minimized",
        cache: Optional[ClassificationCache] = None,
    ) -> None:
        """Store the collaborators used by :meth:`klasyfikuj`.

        Args:
            provider: Backend whose ``klasyfikuj`` method produces the tags.
            profil: Privacy profile passed to ``zbuduj_payload``.
            cache: Optional result cache; ``None`` disables caching.
        """
        self.provider = provider
        self.profil = profil
        self.cache = cache

    def klasyfikuj(self, kontakt: dict[str, Any]) -> ClassifiedContact:
        """
        Classify a single contact.

        Args:
            kontakt: Dict with the contact's data (a CSV row).

        Returns:
            A ``ClassifiedContact`` with status ``"ok"`` and tags,
            ``"skipped"`` when the payload is empty, or ``"error"`` when
            the provider raised.
        """
        # ``or ""`` guards against the key being present with a None value.
        email = (kontakt.get("email") or "").strip()

        # Step 1: build the payload; the returned warnings are not used here.
        payload, _ostrzezenia = zbuduj_payload(kontakt, self.profil)

        # Nothing left to classify — skip the contact.
        if not payload:
            return ClassifiedContact(
                email=email,
                status="skipped",
                blad="Brak danych do klasyfikacji po redakcji PII",
            )

        # Step 2: try the cache.
        if self.cache is not None:
            cached = self.cache.pobierz(payload)
            if cached:
                return self._zbuduj_wynik(
                    kontakt,
                    email,
                    branza=cached.get("branza", "nieznane"),
                    rola=cached.get("rola", "nieznane"),
                    intencja=cached.get("intencja", "nieokreslona"),
                    pewnosc=cached.get("pewnosc", 0.0),
                )

        # Step 3: format the payload for the prompt.
        dane_dla_ai = sformatuj_payload_dla_prompta(payload)

        # Step 4: call the provider; any failure becomes an "error" result
        # so that one bad contact does not abort the caller.
        try:
            wynik: TagResult = self.provider.klasyfikuj(dane_dla_ai)
        except Exception as e:
            logger.error("Błąd klasyfikacji %s: %s", email, str(e)[:100])
            # NOTE(review): the intent fallback here is "nieznane" while the
            # cache path defaults to "nieokreslona" — confirm which is intended.
            return ClassifiedContact(
                email=email,
                tag_branza="nieznane",
                tag_rola="nieznane",
                tag_intencja="nieznane",
                tag_pewnosc=0.0,
                status="error",
                blad=str(e)[:200],
            )

        # Step 5: store the tags in the cache.
        wynik_dict = {
            "branza": wynik.branza.value,
            "rola": wynik.rola.value,
            "intencja": wynik.intencja.value,
            "pewnosc": wynik.pewnosc,
        }
        if self.cache is not None:
            self.cache.zapisz(payload, wynik_dict)

        # Step 6: return the result.
        return self._zbuduj_wynik(
            kontakt,
            email,
            branza=wynik_dict["branza"],
            rola=wynik_dict["rola"],
            intencja=wynik_dict["intencja"],
            pewnosc=wynik_dict["pewnosc"],
        )

    @staticmethod
    def _zbuduj_wynik(
        kontakt: dict[str, Any],
        email: str,
        branza: Any,
        rola: Any,
        intencja: Any,
        pewnosc: Any,
    ) -> ClassifiedContact:
        """Build the ``"ok"`` result, copying the contact's own fields.

        Shared by the cache-hit and provider paths so that both return the
        same set of contact fields.
        """
        return ClassifiedContact(
            email=email,
            first_name=kontakt.get("first_name"),
            last_name=kontakt.get("last_name"),
            name=kontakt.get("name"),
            company=kontakt.get("company"),
            position=kontakt.get("position") or kontakt.get("title"),
            industry=kontakt.get("industry"),
            notes=kontakt.get("notes"),
            website=kontakt.get("website"),
            city=kontakt.get("city"),
            country=kontakt.get("country"),
            tag_branza=branza,
            tag_rola=rola,
            tag_intencja=intencja,
            tag_pewnosc=pewnosc,
            status="ok",
            blad=None,
        )
|