synth-ai 0.2.4.dev4__py3-none-any.whl → 0.2.4.dev5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in one of the supported registries. It is provided for informational purposes only.
- synth_ai/environments/examples/__init__.py +1 -0
- synth_ai/environments/examples/crafter_classic/__init__.py +8 -0
- synth_ai/environments/examples/crafter_classic/config_logging.py +111 -0
- synth_ai/environments/examples/crafter_classic/debug_translation.py +0 -0
- synth_ai/environments/examples/crafter_classic/engine.py +575 -0
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +63 -0
- synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +5 -0
- synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +74 -0
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +266 -0
- synth_ai/environments/examples/crafter_classic/environment.py +364 -0
- synth_ai/environments/examples/crafter_classic/taskset.py +233 -0
- synth_ai/environments/examples/crafter_classic/trace_hooks_v3.py +229 -0
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +298 -0
- synth_ai/environments/examples/crafter_custom/__init__.py +4 -0
- synth_ai/environments/examples/crafter_custom/crafter/__init__.py +7 -0
- synth_ai/environments/examples/crafter_custom/crafter/config.py +182 -0
- synth_ai/environments/examples/crafter_custom/crafter/constants.py +8 -0
- synth_ai/environments/examples/crafter_custom/crafter/engine.py +269 -0
- synth_ai/environments/examples/crafter_custom/crafter/env.py +266 -0
- synth_ai/environments/examples/crafter_custom/crafter/objects.py +418 -0
- synth_ai/environments/examples/crafter_custom/crafter/recorder.py +187 -0
- synth_ai/environments/examples/crafter_custom/crafter/worldgen.py +119 -0
- synth_ai/environments/examples/crafter_custom/dataset_builder.py +373 -0
- synth_ai/environments/examples/crafter_custom/environment.py +312 -0
- synth_ai/environments/examples/crafter_custom/run_dataset.py +305 -0
- synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +156 -0
- synth_ai/environments/examples/enron/art_helpers/local_email_db.py +280 -0
- synth_ai/environments/examples/enron/art_helpers/types_enron.py +24 -0
- synth_ai/environments/examples/enron/engine.py +291 -0
- synth_ai/environments/examples/enron/environment.py +165 -0
- synth_ai/environments/examples/enron/taskset.py +112 -0
- synth_ai/environments/examples/minigrid/__init__.py +48 -0
- synth_ai/environments/examples/minigrid/engine.py +589 -0
- synth_ai/environments/examples/minigrid/environment.py +274 -0
- synth_ai/environments/examples/minigrid/environment_mapping.py +242 -0
- synth_ai/environments/examples/minigrid/puzzle_loader.py +416 -0
- synth_ai/environments/examples/minigrid/taskset.py +583 -0
- synth_ai/environments/examples/nethack/__init__.py +7 -0
- synth_ai/environments/examples/nethack/achievements.py +337 -0
- synth_ai/environments/examples/nethack/engine.py +738 -0
- synth_ai/environments/examples/nethack/environment.py +255 -0
- synth_ai/environments/examples/nethack/helpers/__init__.py +42 -0
- synth_ai/environments/examples/nethack/helpers/action_mapping.py +301 -0
- synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +401 -0
- synth_ai/environments/examples/nethack/helpers/observation_utils.py +433 -0
- synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +201 -0
- synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +268 -0
- synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +308 -0
- synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +430 -0
- synth_ai/environments/examples/nethack/taskset.py +323 -0
- synth_ai/environments/examples/red/__init__.py +7 -0
- synth_ai/environments/examples/red/config_logging.py +110 -0
- synth_ai/environments/examples/red/engine.py +693 -0
- synth_ai/environments/examples/red/engine_helpers/__init__.py +1 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +28 -0
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +275 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +142 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +56 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +283 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +149 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +137 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +56 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +330 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +120 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +558 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +312 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +147 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +246 -0
- synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +367 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +139 -0
- synth_ai/environments/examples/red/environment.py +235 -0
- synth_ai/environments/examples/red/taskset.py +77 -0
- synth_ai/environments/examples/sokoban/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/engine.py +675 -0
- synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +656 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +17 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +3 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +129 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +370 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +331 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +305 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +66 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +114 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +122 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +394 -0
- synth_ai/environments/examples/sokoban/environment.py +228 -0
- synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +438 -0
- synth_ai/environments/examples/sokoban/puzzle_loader.py +311 -0
- synth_ai/environments/examples/sokoban/taskset.py +425 -0
- synth_ai/environments/examples/tictactoe/__init__.py +1 -0
- synth_ai/environments/examples/tictactoe/engine.py +368 -0
- synth_ai/environments/examples/tictactoe/environment.py +239 -0
- synth_ai/environments/examples/tictactoe/taskset.py +214 -0
- synth_ai/environments/examples/verilog/__init__.py +10 -0
- synth_ai/environments/examples/verilog/engine.py +328 -0
- synth_ai/environments/examples/verilog/environment.py +349 -0
- synth_ai/environments/examples/verilog/taskset.py +418 -0
- {synth_ai-0.2.4.dev4.dist-info → synth_ai-0.2.4.dev5.dist-info}/METADATA +1 -1
- {synth_ai-0.2.4.dev4.dist-info → synth_ai-0.2.4.dev5.dist-info}/RECORD +104 -6
- {synth_ai-0.2.4.dev4.dist-info → synth_ai-0.2.4.dev5.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev4.dist-info → synth_ai-0.2.4.dev5.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.4.dev4.dist-info → synth_ai-0.2.4.dev5.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev4.dist-info → synth_ai-0.2.4.dev5.dist-info}/top_level.txt +0 -0
synth_ai/environments/examples/enron/art_helpers/email_search_tools.py

```diff
@@ -0,0 +1,156 @@
+import sqlite3
+import logging
+import textwrap
+from typing import List, Optional
+from dataclasses import dataclass
+
+from synth_ai.environments.environment.db.sqlite import SQLiteManager
+from synth_ai.environments.examples.enron.art_helpers.types_enron import Email
+
+# Configure logger for this module
+logger = logging.getLogger(__name__)
+if not logger.handlers:  # avoid duplicate handlers in pytest -x
+    h = logging.StreamHandler()
+    h.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+    logger.addHandler(h)
+logger.setLevel(logging.DEBUG)  # DEBUG so we see the raw SQL
+
+
+@dataclass
+class SearchResult:
+    message_id: str
+    snippet: str
+    score: float
+
+
+def search_emails(
+    sqlite_manager: SQLiteManager,
+    inbox: str,
+    keywords: List[str],
+    from_addr: Optional[str] = None,
+    to_addr: Optional[str] = None,
+    sent_after: Optional[str] = None,
+    sent_before: Optional[str] = None,
+    max_results: int = 10,
+) -> List[SearchResult]:
+    """
+    Searches the email database based on keywords, inbox, sender, recipient, and date range.
+
+    Args:
+        sqlite_manager: The SQLiteManager instance for database operations.
+        inbox: The email address of the user performing the search.
+            Results include emails sent from or to (inc. cc/bcc) this address.
+        keywords: A list of keywords that must all appear in the subject or body.
+        from_addr: Optional email address to filter emails sent *from*.
+        to_addr: Optional email address to filter emails sent *to* (inc. cc/bcc).
+        sent_after: Optional date string 'YYYY-MM-DD'. Filters for emails sent on or after this date.
+        sent_before: Optional date string 'YYYY-MM-DD'. Filters for emails sent before this date.
+        max_results: The maximum number of results to return. Cannot exceed 10.
+
+    Returns:
+        A list of SearchResult objects, each containing 'message_id' and 'snippet'.
+        Returns an empty list if no results are found or an error occurs.
+    """
+
+    if not keywords:
+        raise ValueError("No keywords provided for search.")
+    if max_results > 10:
+        # The user snippet implies max_results isn't part of the simplified SQL here.
+        # Keeping the check, but the new SQL query below does not use all filters.
+        # This might need reconciliation if all filters are intended to be used with the new SQL.
+        logger.warning(
+            "max_results > 10, but the provided SQL snippet for logging might not respect all filters."
+        )
+
+    safe_keywords = [k.replace("'", "''") for k in keywords]
+    fts_match_query = " ".join(f'"{k}"' for k in safe_keywords)
+
+    sql_query = textwrap.dedent("""
+        SELECT DISTINCT
+            e.message_id,
+            snippet(emails_fts, -1, '⟪', '⟫', ' … ', 15) AS snip
+        FROM emails e
+        JOIN emails_fts ON e.id = emails_fts.rowid
+        WHERE emails_fts MATCH ?
+        LIMIT ?
+    """).strip()
+
+    params = (fts_match_query, max_results)
+
+    try:
+        with sqlite_manager.connection() as db_conn:
+            rows = db_conn.execute(sql_query, params).fetchall()
+            return [SearchResult(message_id=row[0], snippet=row[1], score=0.0) for row in rows]
+    except sqlite3.Error as e:
+        logger.error(f"Database error during search: {e}\nSQL: {sql_query}\nParams: {params}")
+        return []
+
+
+def read_email(sqlite_manager: SQLiteManager, message_id: str) -> Optional[Email]:
+    """
+    Retrieves a single email by its message_id from the database.
+
+    Args:
+        sqlite_manager: The SQLiteManager instance for database operations.
+        message_id: The unique identifier of the email to retrieve.
+
+    Returns:
+        An Email object containing the details of the found email,
+        or None if the email is not found or an error occurs.
+    """
+
+    email_sql = """
+        SELECT id, message_id, date, subject, from_address, body, file_name
+        FROM emails
+        WHERE message_id = ?;
+    """
+
+    recipients_sql = """
+        SELECT recipient_address, recipient_type
+        FROM recipients
+        WHERE email_id = ?;
+    """
+
+    try:
+        with sqlite_manager.connection() as db_conn:
+            cursor = db_conn.cursor()
+            cursor.execute(email_sql, (message_id,))
+            email_row = cursor.fetchone()
+
+            if not email_row:
+                logging.warning(f"Email with message_id '{message_id}' not found.")
+                return None
+
+            email_id, msg_id, date, subject, from_addr, body, file_name = email_row
+            # Fetch recipients for this email primary key
+            cursor.execute(recipients_sql, (email_id,))
+            recipient_rows = cursor.fetchall()
+    except sqlite3.Error as e:
+        logger.error(f"Database error reading email {message_id}: {e}")
+        return None
+
+    to_addresses: List[str] = []
+    cc_addresses: List[str] = []
+    bcc_addresses: List[str] = []
+
+    for addr, type_val in recipient_rows:
+        type_lower = type_val.lower()
+        if type_lower == "to":
+            to_addresses.append(addr)
+        elif type_lower == "cc":
+            cc_addresses.append(addr)
+        elif type_lower == "bcc":
+            bcc_addresses.append(addr)
+
+    email_obj = Email(
+        message_id=msg_id,
+        date=date,
+        subject=subject,
+        from_address=from_addr,
+        to_addresses=to_addresses,
+        cc_addresses=cc_addresses,
+        bcc_addresses=bcc_addresses,
+        body=body,
+        file_name=file_name,
+    )
+    return email_obj
```
synth_ai/environments/examples/enron/art_helpers/local_email_db.py

```diff
@@ -0,0 +1,280 @@
+import sqlite3
+import os
+import logging
+from datasets import load_dataset, Dataset, Features, Value, Sequence
+from tqdm import tqdm
+from datetime import datetime
+
+# Resolve paths relative to this file so it works regardless of the current working directory
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Database will live in "../data/enron_emails.db" relative to project root
+DEFAULT_DB_PATH = os.path.join(BASE_DIR, "..", "..", "data", "enron_emails.db")
+
+DEFAULT_REPO_ID = "corbt/enron-emails"
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+
+# --- Database Schema ---
+SQL_CREATE_TABLES = """
+DROP TABLE IF EXISTS recipients;
+DROP TABLE IF EXISTS emails_fts;
+DROP TABLE IF EXISTS emails;
+
+CREATE TABLE emails (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    message_id TEXT UNIQUE,
+    subject TEXT,
+    from_address TEXT,
+    date TEXT, -- Store as ISO 8601 string 'YYYY-MM-DD HH:MM:SS'
+    body TEXT,
+    file_name TEXT
+);
+
+CREATE TABLE recipients (
+    email_id INTEGER,
+    recipient_address TEXT,
+    recipient_type TEXT, -- 'to', 'cc', 'bcc'
+    FOREIGN KEY(email_id) REFERENCES emails(id) ON DELETE CASCADE
+);
+"""
+
+SQL_CREATE_INDEXES_TRIGGERS = """
+CREATE INDEX idx_emails_from ON emails(from_address);
+CREATE INDEX idx_emails_date ON emails(date);
+CREATE INDEX idx_emails_message_id ON emails(message_id);
+CREATE INDEX idx_recipients_address ON recipients(recipient_address);
+CREATE INDEX idx_recipients_type ON recipients(recipient_type);
+CREATE INDEX idx_recipients_email_id ON recipients(email_id);
+CREATE INDEX idx_recipients_address_email ON recipients(recipient_address, email_id);
+
+CREATE VIRTUAL TABLE emails_fts USING fts5(
+    subject,
+    body,
+    content='emails',
+    content_rowid='id'
+);
+
+CREATE TRIGGER emails_ai AFTER INSERT ON emails BEGIN
+    INSERT INTO emails_fts (rowid, subject, body)
+    VALUES (new.id, new.subject, new.body);
+END;
+
+CREATE TRIGGER emails_ad AFTER DELETE ON emails BEGIN
+    DELETE FROM emails_fts WHERE rowid=old.id;
+END;
+
+CREATE TRIGGER emails_au AFTER UPDATE ON emails BEGIN
+    UPDATE emails_fts SET subject=new.subject, body=new.body WHERE rowid=old.id;
+END;
+
+INSERT INTO emails_fts (rowid, subject, body) SELECT id, subject, body FROM emails;
+"""
+
+
+# --- Functions ---
+
+
+def download_dataset(repo_id: str) -> Dataset:
+    """Downloads the dataset from Hugging Face Hub."""
+    logging.info(f"Attempting to download dataset from Hugging Face Hub: {repo_id}")
+    expected_features = Features(
+        {
+            "message_id": Value("string"),
+            "subject": Value("string"),
+            "from": Value("string"),
+            "to": Sequence(Value("string")),
+            "cc": Sequence(Value("string")),
+            "bcc": Sequence(Value("string")),
+            "date": Value("timestamp[us]"),
+            "body": Value("string"),
+            "file_name": Value("string"),
+        }
+    )
+    dataset_obj = load_dataset(repo_id, features=expected_features, split="train")
+    # Basic type check remains useful
+    if not isinstance(dataset_obj, Dataset):
+        raise TypeError(f"Expected Dataset, got {type(dataset_obj)}")
+    logging.info(f"Successfully loaded dataset '{repo_id}' with {len(dataset_obj)} records.")
+    return dataset_obj
+
+
+def create_database(db_path: str):
+    """Creates the SQLite database and tables."""
+    logging.info(f"Creating SQLite database and tables at: {db_path}")
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.executescript(SQL_CREATE_TABLES)
+    conn.commit()
+    conn.close()
+    logging.info("Database tables created successfully.")
+
+
+def populate_database(db_path: str, dataset: Dataset):
+    """Populates the database with data from the Hugging Face dataset."""
+    logging.info(f"Populating database {db_path}...")
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+
+    # --- Performance Pragmas ---
+    conn.execute("PRAGMA synchronous = OFF;")
+    conn.execute("PRAGMA journal_mode = MEMORY;")
+
+    record_count = 0
+    skipped_count = 0  # Keep track of skipped emails due to filters
+    duplicate_count = 0  # Keep track of skipped duplicate emails
+    processed_emails = set()  # Track (subject, body, from) tuples to dedupe
+
+    conn.execute("BEGIN TRANSACTION;")  # Single transaction for bulk insert
+
+    for email_data in tqdm(dataset, desc="Inserting emails"):
+        assert isinstance(email_data, dict)
+        message_id = email_data["message_id"]
+        subject = email_data["subject"]
+        from_address = email_data["from"]
+        date_obj: datetime = email_data["date"]
+        body = email_data["body"]
+        file_name = email_data["file_name"]
+        to_list_raw = email_data["to"]
+        cc_list_raw = email_data["cc"]
+        bcc_list_raw = email_data["bcc"]
+
+        # --- Data Cleaning and Filtering ---
+        date_str = date_obj.strftime("%Y-%m-%d %H:%M:%S")
+        to_list = [str(addr) for addr in to_list_raw if addr]
+        cc_list = [str(addr) for addr in cc_list_raw if addr]
+        bcc_list = [str(addr) for addr in bcc_list_raw if addr]
+
+        # Check body length
+        if len(body) > 5000:
+            logging.debug(f"Skipping email {message_id}: Body length > 5000 characters.")
+            skipped_count += 1
+            continue
+
+        # Check total recipients
+        total_recipients = len(to_list) + len(cc_list) + len(bcc_list)
+        if total_recipients > 30:
+            logging.debug(
+                f"Skipping email {message_id}: Total recipients ({total_recipients}) > 30."
+            )
+            skipped_count += 1
+            continue
+        # --- End Filtering ---
+
+        # --- Deduplication Check ---
+        email_key = (subject, body, from_address)
+        if email_key in processed_emails:
+            logging.debug(
+                f"Skipping duplicate email (Subject: {subject[:50]}..., From: {from_address})"
+            )
+            duplicate_count += 1
+            continue
+        else:
+            processed_emails.add(email_key)
+        # --- End Deduplication ---
+
+        cursor.execute(
+            """
+            INSERT INTO emails (message_id, subject, from_address, date, body, file_name)
+            VALUES (?, ?, ?, ?, ?, ?)
+            """,
+            (message_id, subject, from_address, date_str, body, file_name),
+        )
+        email_pk_id = cursor.lastrowid
+
+        recipient_data = []
+        for addr in to_list:
+            recipient_data.append((email_pk_id, addr, "to"))
+        for addr in cc_list:
+            recipient_data.append((email_pk_id, addr, "cc"))
+        for addr in bcc_list:
+            recipient_data.append((email_pk_id, addr, "bcc"))
+
+        if recipient_data:
+            cursor.executemany(
+                """
+                INSERT INTO recipients (email_id, recipient_address, recipient_type)
+                VALUES (?, ?, ?)
+                """,
+                recipient_data,
+            )
+        record_count += 1
+
+    conn.commit()
+    conn.close()
+    logging.info(f"Successfully inserted {record_count} email records.")
+    if skipped_count > 0:
+        logging.info(f"Skipped {skipped_count} email records due to length or recipient limits.")
+    if duplicate_count > 0:
+        logging.info(
+            f"Skipped {duplicate_count} duplicate email records (based on subject, body, from)."
+        )
+
+
+def create_indexes_and_triggers(db_path: str):
+    """Creates indexes and triggers on the populated database."""
+    logging.info(f"Creating indexes and triggers for database: {db_path}...")
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.executescript(SQL_CREATE_INDEXES_TRIGGERS)
+    conn.commit()
+    conn.close()
+    logging.info("Indexes and triggers created successfully.")
+
+
+def generate_database(overwrite: bool = False):
+    """
+    Generates the SQLite database from the specified Hugging Face dataset.
+    Simplified version without extensive error handling.
+
+    Args:
+        repo_id: The Hugging Face repository ID for the dataset.
+        db_path: The path where the SQLite database file should be created.
+        overwrite: If True, any existing database file at db_path will be removed.
+    """
+    logging.info(
+        f"Starting database generation for repo '{DEFAULT_REPO_ID}' at '{DEFAULT_DB_PATH}'"
+    )
+    logging.info(f"Overwrite existing database: {overwrite}")
+
+    db_dir = os.path.dirname(DEFAULT_DB_PATH)
+    if db_dir and not os.path.exists(db_dir):
+        logging.info(f"Creating data directory: {db_dir}")
+        os.makedirs(db_dir)
+
+    if overwrite and os.path.exists(DEFAULT_DB_PATH):
+        logging.info(f"Removing existing database file: {DEFAULT_DB_PATH}")
+        os.remove(DEFAULT_DB_PATH)
+
+    if not os.path.exists(DEFAULT_DB_PATH):
+        dataset = download_dataset(DEFAULT_REPO_ID)
+        create_database(DEFAULT_DB_PATH)
+        populate_database(DEFAULT_DB_PATH, dataset)
+        create_indexes_and_triggers(DEFAULT_DB_PATH)
+
+        # ---- new: add unique index post-creation if not already handled by SQL_CREATE_INDEXES_TRIGGERS ---
+        # This ensures the index exists even if SQL_CREATE_INDEXES_TRIGGERS was modified
+        # or if we want to be absolutely certain this specific index is applied.
+        conn = sqlite3.connect(DEFAULT_DB_PATH)
+        cur = conn.cursor()
+        logging.info("Ensuring UNIQUE index on emails.message_id exists...")
+        cur.executescript(
+            """
+            CREATE UNIQUE INDEX IF NOT EXISTS idx_emails_message_id
+            ON emails(message_id);
+            """
+        )
+        conn.commit()
+        conn.close()
+        logging.info("UNIQUE index on emails.message_id verified/created.")
+        # ---- end new section ----
+    else:
+        logging.info(
+            f"Database already exists at {DEFAULT_DB_PATH}. Set overwrite=True to regenerate."
+        )
+
+    logging.info("Database generation process complete.")
+
+
+if __name__ == "__main__":
+    generate_database(overwrite=True)
```
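
The search path above hinges on SQLite's external-content FTS5 table, kept in sync with `emails` by the `emails_ai` insert trigger. A self-contained sketch of that mechanism, using only the standard library and assuming the bundled SQLite was compiled with FTS5 (true for most CPython distributions):

```python
# Standalone demonstration of the external-content FTS5 pattern used above.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE emails (id INTEGER PRIMARY KEY AUTOINCREMENT, subject TEXT, body TEXT);
    CREATE VIRTUAL TABLE emails_fts USING fts5(subject, body, content='emails', content_rowid='id');
    CREATE TRIGGER emails_ai AFTER INSERT ON emails BEGIN
        INSERT INTO emails_fts (rowid, subject, body) VALUES (new.id, new.subject, new.body);
    END;
""")
conn.execute(
    "INSERT INTO emails (subject, body) VALUES (?, ?)",
    ("Q3 budget meeting", "Please review the budget before the meeting."),
)

# snippet() highlights matches; the arguments mirror those in search_emails
# (column -1 auto-selects the best-matching column, 15 tokens of context).
rows = conn.execute(
    "SELECT snippet(emails_fts, -1, '[', ']', ' ... ', 15) FROM emails_fts WHERE emails_fts MATCH ?",
    ('"budget" "meeting"',),
).fetchall()
print(rows)  # e.g. [('Q3 [budget] [meeting]',)] -- one row per matching email
```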
synth_ai/environments/examples/enron/art_helpers/types_enron.py

```diff
@@ -0,0 +1,24 @@
+from pydantic import BaseModel
+from typing import List, Optional
+
+
+class SyntheticQuery(BaseModel):
+    id: int
+    question: str
+    answer: str
+    message_ids: List[str]  # message_ids (strings) of referenced emails
+    how_realistic: float
+    inbox_address: str
+    query_date: str
+
+
+class Email(BaseModel):
+    message_id: str
+    date: str  # ISO 8601 string 'YYYY-MM-DD HH:MM:SS'
+    subject: Optional[str] = None
+    from_address: Optional[str] = None
+    to_addresses: List[str] = []  # Populated from recipients table
+    cc_addresses: List[str] = []  # Populated from recipients table
+    bcc_addresses: List[str] = []  # Populated from recipients table
+    body: Optional[str] = None
+    file_name: Optional[str] = None
```