email-to-calendar 20250826010803.dev0__py3-none-any.whl → 20251210163203.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +10 -12
- db.py +7 -8
- email_to_calendar-20251210163203.dev0.dist-info/METADATA +41 -0
- email_to_calendar-20251210163203.dev0.dist-info/RECORD +20 -0
- events/caldav.py +13 -3
- mail/mail_idle.py +187 -0
- main.py +120 -39
- model/email.py +61 -30
- model/event.py +63 -102
- util/ai.py +163 -0
- util/env.py +97 -0
- util/notifications.py +20 -0
- email_to_calendar-20250826010803.dev0.dist-info/METADATA +0 -25
- email_to_calendar-20250826010803.dev0.dist-info/RECORD +0 -17
- util/text.py +0 -856
- {email_to_calendar-20250826010803.dev0.dist-info → email_to_calendar-20251210163203.dev0.dist-info}/WHEEL +0 -0
- {email_to_calendar-20250826010803.dev0.dist-info → email_to_calendar-20251210163203.dev0.dist-info}/licenses/LICENSE +0 -0
- {email_to_calendar-20250826010803.dev0.dist-info → email_to_calendar-20251210163203.dev0.dist-info}/top_level.txt +0 -0
util/text.py
DELETED
|
@@ -1,856 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from datetime import datetime, date, time, timedelta
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
from typing import Optional, List, Tuple
|
|
5
|
-
from bs4 import BeautifulSoup
|
|
6
|
-
|
|
7
|
-
from src.model.event import Event
|
|
8
|
-
from src.model.email import EMail
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
class ParsedEvent:
    """A calendar event extracted from an email, before conversion to ``Event``.

    Holds the raw date/time pieces found by the parser.  ``to_event`` turns
    them into concrete start/end datetimes and an ``Event`` instance.
    """

    start_date: date
    email: "EMail"  # source email; ``to_event`` reads ``email.id``
    end_date: Optional[date] = None
    start_time: Optional[time] = None
    end_time: Optional[time] = None
    summary: str = ""
    is_all_day: bool = True
    is_tentative: bool = False  # True if date contains "or"

    def __str__(self):
        """Return ``"<date[ to date]> <time[ to time]> - <summary>[ (Tentative)]"``."""
        date_str = self.start_date.strftime("%Y-%m-%d")
        if self.end_date:
            date_str += f" to {self.end_date.strftime('%Y-%m-%d')}"
        if self.is_all_day:
            time_str = "All day"
        elif self.start_time:
            time_str = self.start_time.strftime("%H:%M")
        else:
            time_str = "N/A"
        if self.end_time:
            time_str += f" to {self.end_time.strftime('%H:%M')}"
        tentative_str = " (Tentative)" if self.is_tentative else ""
        return f"{date_str} {time_str} - {self.summary}{tentative_str}"

    def to_event(self):
        """Convert this ParsedEvent to an ``Event`` model instance.

        The summary is passed through ``strip_formatting`` first.  The event
        is created with ``in_calendar=False`` and ``email_id=self.email.id``.
        """
        start_datetime, end_datetime = self._resolve_bounds()
        return Event(
            start=start_datetime,
            end=end_datetime,
            summary=self.strip_formatting(self.summary),
            email_id=self.email.id,
            in_calendar=False,
        )

    def _resolve_bounds(self) -> Tuple[datetime, datetime]:
        """Compute the concrete ``(start, end)`` datetimes for this event.

        All-day events run from 00:00:00 to 23:59:59.  Timed events without a
        usable end get a one-hour duration.  An end that is not after the
        start is repaired rather than rejected.
        """
        end_of_day = time(23, 59, 59)

        # A "timed" event without a start time cannot be combined into a
        # datetime (datetime.combine would raise TypeError), so treat it as
        # an all-day event instead of crashing.
        if self.is_all_day or self.start_time is None:
            start_datetime = datetime.combine(self.start_date, time(0, 0))
            last_day = self.start_date
            # An end date earlier than the start date is ignored.
            if self.end_date and self.end_date >= self.start_date:
                last_day = self.end_date
            return start_datetime, datetime.combine(last_day, end_of_day)

        start_datetime = datetime.combine(self.start_date, self.start_time)
        one_hour_later = start_datetime + timedelta(hours=1)

        if self.end_date and self.end_time:
            # Multi-day event with a specific end time.
            end_datetime = datetime.combine(self.end_date, self.end_time)
            if end_datetime <= start_datetime:
                end_datetime = one_hour_later
        elif self.end_date:
            # Multi-day event without an end time: runs to the end of end_date.
            end_datetime = datetime.combine(self.end_date, end_of_day)
            if end_datetime <= start_datetime:
                end_datetime = datetime.combine(self.start_date, end_of_day)
        elif self.end_time:
            # Same-day event with a specific end time.
            end_datetime = datetime.combine(self.start_date, self.end_time)
            if end_datetime <= start_datetime:
                # Usually an AM/PM parsing slip: when both times are in the
                # morning, assume the end was meant to be PM.
                if self.start_time.hour < 12 and self.end_time.hour < 12:
                    shifted = time(self.end_time.hour + 12, self.end_time.minute)
                    end_datetime = datetime.combine(self.start_date, shifted)
                if end_datetime <= start_datetime:
                    end_datetime = one_hour_later
        else:
            # Only a start time is known: assume a one-hour duration.
            end_datetime = one_hour_later

        return start_datetime, end_datetime

    def strip_formatting(self, text: str) -> str:
        """
        Remove markdown, HTML formatting and unwanted special characters from text

        Args:
            text: Text that may contain markdown, HTML formatting, or special characters

        Returns:
            Clean text with formatting removed, special characters replaced by
            spaces, and whitespace runs collapsed to a single space.  Falsy
            input is returned unchanged.
        """
        if not text:
            return text

        # Remove markdown emphasis markers, keeping the wrapped text.
        text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)  # **bold**
        text = re.sub(r"__([^_]+)__", r"\1", text)  # __bold__
        text = re.sub(r"\*([^*]+)\*", r"\1", text)  # *italic*
        text = re.sub(r"_([^_]+)_", r"\1", text)  # _italic_

        # Remove HTML tags if present
        if "<" in text and ">" in text:
            soup = BeautifulSoup(text, "html.parser")
            text = soup.get_text()

        # Replace characters that clutter summaries (colons, dashes, quotes,
        # apostrophes, brackets, ...) with spaces.  Letters, digits, periods,
        # commas and parentheses are kept.
        text = re.sub(r'[><!@#$%^&*_+=\[\]{}\\|;:"\'`~-]', ' ', text).strip()

        # The replacement above leaves runs of spaces behind ("a: b" -> "a  b").
        return re.sub(r"\s+", " ", text)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
class EmailEventParser:
    """Parser for extracting calendar events from email bodies.

    The expected body layout is a list of lines grouped under month headers
    (optionally preceded by a four-digit year line).  Each event line starts
    with a day or day range and may contain clock times anywhere in the text.
    """

    # Month names mapping
    MONTHS = {
        "january": 1,
        "jan": 1,
        "february": 2,
        "feb": 2,
        "march": 3,
        "mar": 3,
        "april": 4,
        "apr": 4,
        "may": 5,
        "june": 6,
        "jun": 6,
        "july": 7,
        "jul": 7,
        "august": 8,
        "aug": 8,
        "september": 9,
        "sep": 9,
        "sept": 9,
        "october": 10,
        "oct": 10,
        "november": 11,
        "nov": 11,
        "december": 12,
        "dec": 12,
    }

    # "2:30pm", "10:15" -- input to these two is already lower-cased.
    _CLOCK_TIME = re.compile(r"^(\d{1,2}):(\d{2})\s*(am|pm)?$")
    # "2pm", "830am", "830ish", "830", "10"
    _DIGIT_TIME = re.compile(r"^(\d{1,4})(?:\s*(am|pm)|(ish))?$")
    # "2-245" (2:00-2:45), "9-10"
    _TIME_RANGE = re.compile(r"^(\d{1,2})-(\d{1,4})$")
    _TIME_RANGE_TOKEN = re.compile(r"\b(\d{1,2})-(\d{1,4})\b")

    # Single-time patterns searched inside an event line, most specific first.
    _SINGLE_TIME_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r"\b(\d{1,2}):(\d{2})\s*(am|pm)\b",  # 2:30pm, 10:15am
            r"\b(\d{1,2})\s*(am|pm)\b",  # 2pm, 10am
            r"\b(\d{1,2}):(\d{2})\b",  # 2:30, 14:15 (without am/pm)
            r"\b(\d{3,4})\s*(am|pm)\b",  # 830am, 1015am (WITH am/pm)
            r"\b(\d{3,4})\b(?!\s*(?:am|pm|ish))",  # 830, 1015
            r"\b(\d{3,4})ish\b",  # 830ish, 1020ish
            r"\b(noon|midnight)\b",  # noon, midnight
            r"\b(\d{1,2})\b(?!\s*(?:am|pm|ish|\d))",  # bare hour like "10"
        )
    )

    # Leading day / day range (incl. cross-month "25-July 4"), an optional
    # "or <alternative>" that is matched but discarded, then the rest.
    _EVENT_DATE = re.compile(
        r"^(\d+(?:st|nd|rd|th)?(?:-(?:\d+(?:st|nd|rd|th)?|\w+\s*\d+(?:st|nd|rd|th)?))?)"
        r"(?:\s+or\s+\d+(?:st|nd|rd|th)?(?:-(?:\d+(?:st|nd|rd|th)?|\w+\s*\d+(?:st|nd|rd|th)?))?)?"
        r"\s*(.*)$",
        re.IGNORECASE,
    )

    _ORDINAL_DAY = re.compile(r"^(\d+)(?:st|nd|rd|th)$", re.IGNORECASE)
    # The month name must be alphabetic; with \w+ a numeric end such as "23"
    # was split into month "2" + day "3".
    _CROSS_MONTH_END = re.compile(
        r"^([A-Za-z]+)\s*-?\s*(\d+)(?:st|nd|rd|th)?$", re.IGNORECASE
    )
    # A line that is only a word wrapped in optional markdown markers.
    # Alphabetic-only so "__January__" is not swallowed as "january__".
    _MONTH_HEADER = re.compile(r"^[*_#]*([A-Za-z]+)[*_#]*$")
    _YEAR_LINE = re.compile(r"^\s*(\d{4})\s*$")

    _SPECIAL_CHARS = re.compile(r'[><!@#$%^&*_+=\[\]{}\\|;"\'`~]')
    _WHITESPACE = re.compile(r"\s+")

    def __init__(self, delivery_date: datetime):
        """
        Initialize parser with email delivery date

        Args:
            delivery_date: The date the email was delivered (used as default year)
        """
        self.delivery_date = delivery_date
        self.current_year = delivery_date.year
        self.current_month = None

    def strip_formatting(self, text: str) -> str:
        """
        Remove markdown and HTML formatting from text

        Args:
            text: Text that may contain markdown or HTML formatting

        Returns:
            Text without emphasis markers or HTML tags, whitespace collapsed.
            Falsy input is returned unchanged.
        """
        if not text:
            return text

        # Remove markdown emphasis markers, keeping the wrapped text.
        text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)  # **bold**
        text = re.sub(r"__([^_]+)__", r"\1", text)  # __bold__
        text = re.sub(r"\*([^*]+)\*", r"\1", text)  # *italic*
        text = re.sub(r"_([^_]+)_", r"\1", text)  # _italic_

        # Remove HTML tags if present
        if "<" in text and ">" in text:
            soup = BeautifulSoup(text, "html.parser")
            text = soup.get_text()

        return self._WHITESPACE.sub(" ", text).strip()

    def clean_html_content(self, html_content: str) -> str:
        """
        Clean HTML content and convert to plain text

        Args:
            html_content: Raw HTML content from email

        Returns:
            Plain text with one non-empty, stripped line per output line
        """
        soup = BeautifulSoup(html_content, "html.parser")

        # Script and style bodies are not user-visible text.
        for element in soup(["script", "style"]):
            element.decompose()

        # Turn <br> into newlines and surround block elements with newlines
        # so the line structure survives get_text().
        for tag in soup.find_all(["div", "br", "p"]):
            if tag.name == "br":
                tag.replace_with(soup.new_string("\n"))
            else:
                tag.insert_before(soup.new_string("\n"))
                tag.insert_after(soup.new_string("\n"))

        stripped = (line.strip() for line in soup.get_text().split("\n"))
        return "\n".join(line for line in stripped if line)

    def parse_time(self, time_str: str) -> Optional[time]:
        """
        Parse time string into time object

        Without an explicit am/pm, hours 1-5 are assumed to be PM for the
        "H:MM", "HHMMish" and bare-hour forms.  Bare 3-4 digit times
        ("830", "230") are taken literally.  With an explicit am/pm the hour
        must be at most 12.  For a range like "2-245" only the start is parsed.

        Args:
            time_str: Time string (e.g., "2pm", "10:30am", "830", "2:15", "noon", "10", "2-245")

        Returns:
            Parsed time object or None if parsing fails
        """
        if not time_str:
            return None

        text = time_str.strip().lower()

        if text == "noon":
            return time(12, 0)
        if text == "midnight":
            return time(0, 0)

        range_match = self._TIME_RANGE.match(text)
        if range_match:
            text = range_match.group(1)

        clock = self._CLOCK_TIME.match(text)
        if clock:
            hour, minute = int(clock.group(1)), int(clock.group(2))
            meridiem = clock.group(3)
            infer_afternoon = True
        else:
            digits_match = self._DIGIT_TIME.match(text)
            if not digits_match:
                return None
            digits, meridiem, ish = digits_match.groups()
            if ish and len(digits) < 3:
                return None  # only "HHMMish" is supported, not "2ish"
            if len(digits) <= 2:
                hour, minute = int(digits), 0
            else:
                # "830" -> 8:30, "1020" -> 10:20
                hour, minute = int(digits[:-2]), int(digits[-2:])
            # NOTE(review): bare "230" stays 02:30 while "2:30", "230ish" and
            # "2" become PM; kept as in the original, confirm if intended.
            infer_afternoon = bool(ish) or len(digits) <= 2

        if meridiem is not None:
            if hour > 12:
                return None  # "15am" / "13pm" are not valid 12-hour times
            if meridiem == "pm" and hour != 12:
                hour += 12
            elif meridiem == "am" and hour == 12:
                hour = 0
        elif infer_afternoon and 1 <= hour <= 5:
            hour += 12

        if 0 <= hour <= 23 and 0 <= minute <= 59:
            return time(hour, minute)
        return None

    def parse_time_range(self, time_range_str: str) -> Tuple[Optional[time], Optional[time]]:
        """
        Parse time range string into start and end time objects

        Args:
            time_range_str: Time range string (e.g., "2-245" meaning 2:00-2:45)

        Returns:
            Tuple of (start_time, end_time).  (None, None) if the string is
            not a range or the start cannot be parsed.  (start_time, None) if
            only the end is unusable.
        """
        if not time_range_str:
            return None, None

        match = self._TIME_RANGE.match(time_range_str.strip())
        if not match:
            return None, None

        start_str, end_str = match.groups()
        start_time = self.parse_time(start_str)
        if start_time is None:
            return None, None

        if len(end_str) >= 3:
            # "245" -> 2:45, "1245" -> 12:45
            hour, minute = int(end_str[:-2]), int(end_str[-2:])
            # A PM start implies a PM end.
            if start_time.hour >= 12 and hour < 12:
                hour += 12
            # Validate before building the time: "1299" or "2500" would
            # otherwise raise ValueError and abort the whole parse.
            if 0 <= hour <= 23 and 0 <= minute <= 59:
                end_time = time(hour, minute)
            else:
                end_time = None
        else:
            end_time = self.parse_time(end_str)

        if end_time is None:
            return start_time, None

        if end_time <= start_time:
            # An AM end at or before the start is assumed to be PM.  Anything
            # else is dropped so the caller applies its default duration.
            if end_time.hour < 12:
                return start_time, time(end_time.hour + 12, end_time.minute)
            return start_time, None

        return start_time, end_time

    @classmethod
    def _strip_ordinal(cls, day_str: str) -> str:
        """Remove an ordinal suffix ("21st" -> "21") from a day string."""
        day_str = day_str.strip()
        match = cls._ORDINAL_DAY.match(day_str)
        return match.group(1) if match else day_str

    def parse_date_range(
        self, date_str: str, month: int, year: int
    ) -> Tuple[Optional[date], Optional[date]]:
        """
        Parse date or date range string, including cross-month ranges

        Args:
            date_str: Date string (e.g., "15", "22-23", "8-11", "21st", "22nd-24th", "25-July 4")
            month: Current month number
            year: Current year

        Returns:
            Tuple of (start_date, end_date). end_date is None for single
            dates.  (None, None) if nothing could be parsed.
        """
        date_str = date_str.strip()

        if "-" in date_str:
            start_part, end_part = (part.strip() for part in date_str.split("-", 1))
            try:
                # The start day is always in the current month.
                start_date = date(year, month, int(self._strip_ordinal(start_part)))

                cross_month = self._CROSS_MONTH_END.match(end_part)
                end_month = (
                    self.MONTHS.get(cross_month.group(1).lower())
                    if cross_month
                    else None
                )
                if end_month is not None:
                    # Cross-month range like "25-July 4".  An earlier end
                    # month means the range wraps into next year.
                    end_year = year + 1 if end_month < month else year
                    end_day = int(cross_month.group(2))
                    return start_date, date(end_year, end_month, end_day)

                # Same-month range like "22-23"
                end_day = int(self._strip_ordinal(end_part))
                end_date = date(year, month, end_day)
                if end_date <= start_date:
                    # End not after start: assume it falls in the next month.
                    next_month, next_year = month + 1, year
                    if next_month > 12:
                        next_month, next_year = 1, year + 1
                    try:
                        end_date = date(next_year, next_month, end_day)
                    except ValueError:
                        # Day does not exist in the next month.
                        return start_date, None
                return start_date, end_date
            except (ValueError, TypeError):
                # Fall through to the single-date attempt below.
                pass

        # Single date (e.g., "15", "21st", "22nd")
        try:
            return date(year, month, int(self._strip_ordinal(date_str))), None
        except (ValueError, TypeError):
            return None, None

    def clean_event_line(self, line: str) -> str:
        """
        Clean special characters and extra whitespace from event lines before parsing

        Hyphens (date ranges), colons (times), parentheses, periods and
        commas are kept.  The other special characters are replaced by spaces.

        Args:
            line: Raw event line that may contain special characters

        Returns:
            Cleaned line ready for parsing
        """
        if not line:
            return line

        cleaned = self._SPECIAL_CHARS.sub(" ", line)
        return self._WHITESPACE.sub(" ", cleaned).strip()

    def _extract_times(self, rest: str) -> Tuple[Optional[time], Optional[time], str]:
        """Find times in *rest*; return (start, end, text without the times used).

        A time range ("2-245") takes priority over individual times.  Among
        individual times, a hit overlapping an already accepted, more specific
        hit is ignored, so "8:30" is not also read as a separate "8".
        """
        text = rest.strip()

        for match in self._TIME_RANGE_TOKEN.finditer(text):
            start_time, end_time = self.parse_time_range(match.group(0))
            if start_time is not None:
                # Use the first parsable range and cut it out of the summary.
                lo, hi = match.span()
                summary = text[:lo] + text[hi:]
                return start_time, end_time, self._WHITESPACE.sub(" ", summary).strip()

        accepted = []  # ((start_pos, end_pos), parsed_time)
        for pattern in self._SINGLE_TIME_PATTERNS:
            for match in pattern.finditer(text):
                lo, hi = match.span()
                if any(lo < taken_hi and taken_lo < hi for (taken_lo, taken_hi), _ in accepted):
                    continue
                parsed = self.parse_time(match.group(0))
                if parsed is not None:
                    accepted.append(((lo, hi), parsed))

        accepted.sort(key=lambda item: item[0][0])
        start_time = accepted[0][1] if accepted else None
        end_time = accepted[1][1] if len(accepted) > 1 else None

        # Remove the matched tokens by position, right to left so earlier
        # spans stay valid.
        summary = text
        for (lo, hi), _ in reversed(accepted):
            summary = summary[:lo] + " " + summary[hi:]

        return start_time, end_time, self._WHITESPACE.sub(" ", summary).strip()

    def parse_event_line(
        self,
        line: str,
        current_month: int,
        current_year: int,
        last_event_date: Optional[date] = None,
        email: Optional["EMail"] = None,
    ) -> List["ParsedEvent"]:
        """
        Parse a single line of event text

        Args:
            line: Line of text containing event information
            current_month: Current month number
            current_year: Current year
            last_event_date: Date from the previous event (used when line doesn't start with a date)
            email: EMail object for linking events

        Returns:
            List of parsed events (empty or a single element)
        """
        events = []

        # The original split the cleaned line on "&" and on runs of
        # whitespace, but clean_event_line already removes "&" and collapses
        # whitespace, so the line is always handled as one event.
        line = self.clean_event_line(line.strip())
        if not line:
            return events

        # Any " or " in the line marks the event as tentative.
        is_tentative = " or " in line.lower()

        end_date = None
        match = self._EVENT_DATE.match(line)
        if match:
            # An "or <alternative date>" part is matched but discarded; only
            # the first date option is used.
            date_part, rest = match.groups()
            start_date, end_date = self.parse_date_range(
                date_part, current_month, current_year
            )
        else:
            # No leading date: reuse the previous event's date.
            start_date, rest = last_event_date, line

        if not start_date:
            return events

        start_time, end_time, summary = self._extract_times(rest)
        is_all_day = start_time is None and end_time is None

        clean_summary = self.strip_formatting(summary)

        # Skip events with empty or formatting-only summaries
        if not clean_summary or clean_summary in ['**', '*', '__', '_', '***', '___', '>', '>>']:
            return events

        # A summary that is just a month name is a header, not an event.
        if clean_summary.lower() in self.MONTHS:
            return events

        events.append(
            ParsedEvent(
                start_date=start_date,
                end_date=end_date,
                start_time=start_time,
                end_time=end_time,
                summary=clean_summary,
                is_all_day=is_all_day,
                is_tentative=is_tentative,
                email=email,
            )
        )
        return events

    def parse_email_body(self, email_body: str, email: "EMail") -> List["ParsedEvent"]:
        """
        Parse email body and extract all calendar events

        A line holding only a four-digit number sets the current year.  A
        line holding only a month name (optionally wrapped in ``*``, ``_`` or
        ``#``) sets the current month.  Other lines are parsed as events once
        a month has been seen.

        Args:
            email_body: Raw email body (HTML or plain text)
            email: EMail object for linking events

        Returns:
            List of parsed calendar events
        """
        if "<" in email_body and ">" in email_body:
            text_content = self.clean_html_content(email_body)
        else:
            text_content = email_body

        events = []
        current_year = self.current_year
        current_month = None
        last_event_date = None  # date of the most recently parsed event

        for raw_line in text_content.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            year_match = self._YEAR_LINE.match(line)
            if year_match:
                current_year = int(year_match.group(1))
                continue

            header = self._MONTH_HEADER.match(line)
            new_month = self.MONTHS.get(header.group(1).lower()) if header else None
            if new_month is not None:
                # Late month followed by early month (Oct/Nov/Dec ->
                # Jan/Feb/Mar) means the list wrapped into the next year.
                if current_month and current_month >= 10 and new_month <= 3:
                    current_year += 1
                current_month = new_month
                continue

            if current_month:
                line_events = self.parse_event_line(
                    line, current_month, current_year, last_event_date, email
                )
                events.extend(line_events)
                if line_events:
                    last_event_date = line_events[-1].start_date

        return events
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
def parse_email_events(email: EMail) -> List[Event]:
    """
    Parse the events contained in an email and return them as Event models

    Builds an EmailEventParser from ``email.delivery_date``, parses
    ``email.body`` with it, and converts every ParsedEvent via ``to_event()``.

    Args:
        email: EMail object containing body and delivery_date

    Returns:
        List of Event model instances
    """
    extractor = EmailEventParser(email.delivery_date)
    return [
        candidate.to_event()
        for candidate in extractor.parse_email_body(email.body, email)
    ]
|