email-to-calendar 20250826010803.dev0__py3-none-any.whl → 20251210163203.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
util/text.py DELETED
@@ -1,856 +0,0 @@
1
- import re
2
- from datetime import datetime, date, time, timedelta
3
- from dataclasses import dataclass
4
- from typing import Optional, List, Tuple
5
- from bs4 import BeautifulSoup
6
-
7
- from src.model.event import Event
8
- from src.model.email import EMail
9
-
10
-
11
- @dataclass
12
- class ParsedEvent:
13
- """Represents a parsed calendar event"""
14
-
15
- start_date: date
16
- email: EMail
17
- end_date: Optional[date] = None
18
- start_time: Optional[time] = None
19
- end_time: Optional[time] = None
20
- summary: str = ""
21
- is_all_day: bool = True
22
- is_tentative: bool = False # True if date contains "or"
23
-
24
- def __str__(self):
25
- date_str = self.start_date.strftime("%Y-%m-%d")
26
- if self.end_date:
27
- date_str += f" to {self.end_date.strftime('%Y-%m-%d')}"
28
- time_str = (
29
- "All day"
30
- if self.is_all_day
31
- else (self.start_time.strftime("%H:%M") if self.start_time else "N/A")
32
- )
33
- if self.end_time:
34
- time_str += f" to {self.end_time.strftime('%H:%M')}"
35
- tentative_str = " (Tentative)" if self.is_tentative else ""
36
- return f"{date_str} {time_str} - {self.summary}{tentative_str}"
37
-
38
- def to_event(self):
39
- """Convert ParsedEvent to Event model instance"""
40
- # Handle different event scenarios properly
41
- if self.is_all_day:
42
- # All-day events: start at midnight, end at 23:59:59
43
- start_datetime = datetime.combine(self.start_date, time(0, 0))
44
- if self.end_date:
45
- # Multi-day all-day event: end at 23:59:59 of the end date
46
- end_datetime = datetime.combine(self.end_date, time(23, 59, 59))
47
-
48
- # Validate that end date is after start date
49
- if end_datetime <= start_datetime:
50
- # If end date is before or equal to start date, assume single day event
51
- end_datetime = datetime.combine(self.start_date, time(23, 59, 59))
52
- else:
53
- # Single-day all-day event: end at 23:59:59 of the same day
54
- end_datetime = datetime.combine(self.start_date, time(23, 59, 59))
55
- else:
56
- # Timed events
57
- start_datetime = datetime.combine(self.start_date, self.start_time)
58
-
59
- if self.end_date and self.end_time:
60
- # Multi-day event with specific end time
61
- end_datetime = datetime.combine(self.end_date, self.end_time)
62
-
63
- # Validate that end datetime is after start datetime
64
- if end_datetime <= start_datetime:
65
- # If end is before start, assume single day event with 1 hour duration
66
- end_datetime = start_datetime + timedelta(hours=1)
67
- elif self.end_date:
68
- # Multi-day event without specific end time - assume it ends at end of end date
69
- proposed_end = datetime.combine(self.end_date, time(23, 59, 59))
70
-
71
- # Validate that end date is after start date
72
- if proposed_end <= start_datetime:
73
- # If end date is before start, assume single day event
74
- end_datetime = datetime.combine(self.start_date, time(23, 59, 59))
75
- else:
76
- end_datetime = proposed_end
77
- elif self.end_time:
78
- # Same-day event with specific end time
79
- end_datetime = datetime.combine(self.start_date, self.end_time)
80
-
81
- # Validate that end time is after start time for same-day events
82
- if end_datetime <= start_datetime:
83
- # If end time is before or equal to start time, try to fix it
84
- # This commonly happens with AM/PM parsing issues
85
-
86
- # If both times are in the same half of the day, add 12 hours to end time
87
- if (self.start_time.hour < 12 and self.end_time.hour < 12) or \
88
- (self.start_time.hour >= 12 and self.end_time.hour >= 12):
89
- # Both are AM or both are PM, likely one should be PM when other is AM
90
- if self.end_time.hour < 12:
91
- # End time is AM, make it PM
92
- fixed_end_time = time(self.end_time.hour + 12, self.end_time.minute)
93
- end_datetime = datetime.combine(self.start_date, fixed_end_time)
94
-
95
- # If it's still not fixed, just add some reasonable duration
96
- if end_datetime <= start_datetime:
97
- # Add 1 hour as default duration
98
- end_datetime = start_datetime + timedelta(hours=1)
99
- else:
100
- # Same-day event with only start time - assume 1 hour duration
101
- end_datetime = start_datetime + timedelta(hours=1)
102
-
103
- # Clean the summary of any formatting before creating the event
104
- clean_summary = self.strip_formatting(self.summary)
105
-
106
- return Event(
107
- start=start_datetime,
108
- end=end_datetime,
109
- summary=clean_summary,
110
- email_id=self.email.id,
111
- in_calendar=False,
112
- )
113
-
114
- def strip_formatting(self, text: str) -> str:
115
- """
116
- Remove markdown, HTML formatting and unwanted special characters from text
117
-
118
- Args:
119
- text: Text that may contain markdown, HTML formatting, or special characters
120
-
121
- Returns:
122
- Clean text without formatting or unwanted characters
123
- """
124
- if not text:
125
- return text
126
-
127
- # Remove markdown formatting
128
- # Bold/italic: **text**, __text__, *text*, _text_
129
- text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) # **bold**
130
- text = re.sub(r"__([^_]+)__", r"\1", text) # __bold__
131
- text = re.sub(r"\*([^*]+)\*", r"\1", text) # *italic*
132
- text = re.sub(r"_([^_]+)_", r"\1", text) # _italic_
133
-
134
- # Remove HTML tags if present
135
- if "<" in text and ">" in text:
136
- soup = BeautifulSoup(text, "html.parser")
137
- text = soup.get_text()
138
-
139
- # Remove unwanted special characters from summaries
140
- # Keep only letters, numbers, spaces, and basic punctuation (periods, commas, apostrophes, parentheses)
141
- # Remove problematic characters like colons, dashes, etc. that clutter summaries
142
- text = re.sub(r'[><!@#$%^&*_+=\[\]{}\\|;:"\'`~-]', ' ', text).strip()
143
-
144
-
145
- return text
146
-
147
-
148
- class EmailEventParser:
149
- """Parser for extracting calendar events from email bodies"""
150
-
151
- # Month names mapping
152
- MONTHS = {
153
- "january": 1,
154
- "jan": 1,
155
- "february": 2,
156
- "feb": 2,
157
- "march": 3,
158
- "mar": 3,
159
- "april": 4,
160
- "apr": 4,
161
- "may": 5,
162
- "june": 6,
163
- "jun": 6,
164
- "july": 7,
165
- "jul": 7,
166
- "august": 8,
167
- "aug": 8,
168
- "september": 9,
169
- "sep": 9,
170
- "sept": 9,
171
- "october": 10,
172
- "oct": 10,
173
- "november": 11,
174
- "nov": 11,
175
- "december": 12,
176
- "dec": 12,
177
- }
178
-
179
- def __init__(self, delivery_date: datetime):
180
- """
181
- Initialize parser with email delivery date
182
-
183
- Args:
184
- delivery_date: The date the email was delivered (used as default year)
185
- """
186
- self.delivery_date = delivery_date
187
- self.current_year = delivery_date.year
188
- self.current_month = None
189
-
190
- def strip_formatting(self, text: str) -> str:
191
- """
192
- Remove markdown and HTML formatting from text
193
-
194
- Args:
195
- text: Text that may contain markdown or HTML formatting
196
-
197
- Returns:
198
- Clean text without formatting
199
- """
200
- if not text:
201
- return text
202
-
203
- # Remove markdown formatting
204
- # Bold/italic: **text**, __text__, *text*, _text_
205
- text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) # **bold**
206
- text = re.sub(r"__([^_]+)__", r"\1", text) # __bold__
207
- text = re.sub(r"\*([^*]+)\*", r"\1", text) # *italic*
208
- text = re.sub(r"_([^_]+)_", r"\1", text) # _italic_
209
-
210
- # Remove HTML tags if present
211
- if "<" in text and ">" in text:
212
- soup = BeautifulSoup(text, "html.parser")
213
- text = soup.get_text()
214
-
215
- # Clean up extra whitespace
216
- text = re.sub(r"\s+", " ", text).strip()
217
-
218
- return text
219
-
220
- def clean_html_content(self, html_content: str) -> str:
221
- """
222
- Clean HTML content and convert to plain text
223
-
224
- Args:
225
- html_content: Raw HTML content from email
226
-
227
- Returns:
228
- Cleaned plain text content
229
- """
230
- # Parse HTML
231
- soup = BeautifulSoup(html_content, "html.parser")
232
-
233
- # Remove script and style elements
234
- for script in soup(["script", "style"]):
235
- script.decompose()
236
-
237
- # Add line breaks before certain elements to preserve structure
238
- for tag in soup.find_all(["div", "br", "p"]):
239
- if tag.name == "br":
240
- tag.replace_with(soup.new_string("\n"))
241
- else:
242
- # Add newlines around block elements
243
- tag.insert_before(soup.new_string("\n"))
244
- tag.insert_after(soup.new_string("\n"))
245
-
246
- # Get text content
247
- text = soup.get_text()
248
-
249
- # Clean up whitespace and line breaks
250
- lines = []
251
- for line in text.split("\n"):
252
- line = line.strip()
253
- if line:
254
- lines.append(line)
255
-
256
- return "\n".join(lines)
257
-
258
- def parse_time(self, time_str: str) -> Optional[time]:
259
- """
260
- Parse time string into time object
261
-
262
- Args:
263
- time_str: Time string (e.g., "2pm", "10:30am", "830", "2:15", "noon", "10", "2-245")
264
-
265
- Returns:
266
- Parsed time object or None if parsing fails
267
- """
268
- if not time_str:
269
- return None
270
-
271
- time_str = time_str.strip().lower()
272
-
273
- # Handle special cases
274
- if time_str == "noon":
275
- return time(12, 0)
276
- elif time_str == "midnight":
277
- return time(0, 0)
278
-
279
- # Handle time ranges like "2-245" - extract only the start time
280
- time_range_match = re.match(r"^(\d{1,2})-(\d{1,4})$", time_str)
281
- if time_range_match:
282
- start_time_str = time_range_match.group(1)
283
- # Recursively parse the start time
284
- return self.parse_time(start_time_str)
285
-
286
- # Handle various time formats
287
- time_patterns = [
288
- r"^(\d{1,2}):(\d{2})\s*(am|pm)?$", # 2:30pm, 10:15
289
- r"^(\d{1,2})\s*(am|pm)$", # 2pm, 10am
290
- r"^(\d{3,4})\s*(am|pm)$", # 830am, 1020am (WITH am/pm required)
291
- r"^(\d{3,4})ish$", # 830ish, 1020ish
292
- r"^(\d{3,4})$", # 830, 1020 (without am/pm)
293
- r"^(\d{1,2})$", # Single digit hours like "10", "2" (assume appropriate AM/PM)
294
- ]
295
-
296
- for pattern in time_patterns:
297
- match = re.match(pattern, time_str)
298
- if match:
299
- groups = match.groups()
300
-
301
- if len(groups) >= 2 and ":" in time_str: # HH:MM format
302
- hour, minute = int(groups[0]), int(groups[1])
303
- ampm = groups[2] if len(groups) > 2 else None
304
-
305
- # Smart AM/PM inference for times without explicit am/pm
306
- if not ampm:
307
- # Common appointment/event time patterns
308
- if hour >= 1 and hour <= 5: # 1:30, 2:50, 4:10 likely PM
309
- hour += 12
310
- elif hour == 12: # 12:XX likely PM (noon hour)
311
- pass # Keep as is
312
- # Hours 6-11 and 13+ stay as is (morning or 24-hour format)
313
-
314
- elif (
315
- len(groups) >= 2 and groups[1] and groups[1] in ["am", "pm"]
316
- ): # H am/pm format (including HHMM am/pm)
317
- hour_or_time_digits = groups[0]
318
- ampm = groups[1]
319
-
320
- # Check if it's a 3-4 digit time like 830am or 1020am
321
- if len(hour_or_time_digits) >= 3: # HHMM format with am/pm
322
- if len(hour_or_time_digits) == 3: # 830 = 8:30
323
- hour, minute = (
324
- int(hour_or_time_digits[0]),
325
- int(hour_or_time_digits[1:]),
326
- )
327
- else: # 1020 = 10:20
328
- hour, minute = (
329
- int(hour_or_time_digits[:2]),
330
- int(hour_or_time_digits[2:]),
331
- )
332
- else: # Single or double digit hour
333
- hour, minute = int(hour_or_time_digits), 0
334
-
335
- elif "ish" in time_str: # HHMMish format
336
- time_digits = groups[0]
337
- if len(time_digits) == 3: # 830 = 8:30
338
- hour, minute = int(time_digits[0]), int(time_digits[1:])
339
- elif len(time_digits) == 4: # 1020 = 10:20
340
- hour, minute = int(time_digits[:2]), int(time_digits[2:])
341
- else:
342
- continue
343
-
344
- # For "ish" times, assume reasonable defaults based on hour
345
- if 6 <= hour <= 11: # Morning hours
346
- ampm = "am"
347
- elif 1 <= hour <= 5: # Afternoon hours
348
- ampm = "pm"
349
- else:
350
- ampm = None # For 12 and hours >= 13, leave as is
351
-
352
- elif len(groups[0]) >= 3: # HHMM format without am/pm
353
- time_digits = groups[0]
354
- if len(time_digits) == 3: # 830 = 8:30
355
- hour, minute = int(time_digits[0]), int(time_digits[1:])
356
- elif len(time_digits) == 4: # 1020 = 10:20
357
- hour, minute = int(time_digits[:2]), int(time_digits[2:])
358
- else:
359
- continue
360
- ampm = None
361
- else: # Single digit hour (like "10", "2")
362
- hour, minute = int(groups[0]), 0
363
- ampm = None
364
- # Smart AM/PM inference for single digit hours
365
- if hour >= 1 and hour <= 5:
366
- # Hours 1-5 are likely PM for appointments
367
- hour += 12
368
- elif hour >= 6 and hour <= 11:
369
- # Hours 6-11 are likely AM
370
- pass # Keep as is
371
- elif hour == 12:
372
- # 12 is likely PM (noon)
373
- pass # Keep as is
374
- # Hours >= 13 are already in 24-hour format
375
-
376
- # Handle AM/PM (only if not already processed above)
377
- if ampm == "pm" and hour != 12:
378
- hour += 12
379
- elif ampm == "am" and hour == 12:
380
- hour = 0
381
-
382
- # Validate hour and minute
383
- if 0 <= hour <= 23 and 0 <= minute <= 59:
384
- return time(hour, minute)
385
-
386
- return None
387
-
388
- def parse_time_range(self, time_range_str: str) -> Tuple[Optional[time], Optional[time]]:
389
- """
390
- Parse time range string into start and end time objects
391
-
392
- Args:
393
- time_range_str: Time range string (e.g., "2-245" meaning 2:00-2:45)
394
-
395
- Returns:
396
- Tuple of (start_time, end_time) or (None, None) if parsing fails
397
- """
398
- if not time_range_str:
399
- return None, None
400
-
401
- # Handle time ranges like "2-245"
402
- time_range_match = re.match(r"^(\d{1,2})-(\d{1,4})$", time_range_str.strip())
403
- if time_range_match:
404
- start_str = time_range_match.group(1)
405
- end_str = time_range_match.group(2)
406
-
407
- # Parse start time
408
- start_time = self.parse_time(start_str)
409
- if not start_time:
410
- return None, None
411
-
412
- # Parse end time - need to handle formats like "245" meaning 2:45
413
- end_time = None
414
- if len(end_str) == 3: # "245" = 2:45
415
- hour = int(end_str[0])
416
- minute = int(end_str[1:])
417
- # Use same AM/PM logic as start time
418
- if start_time.hour >= 12: # Start time is PM
419
- if hour < 12:
420
- hour += 12
421
- end_time = time(hour, minute)
422
- elif len(end_str) == 4: # "1245" = 12:45
423
- hour = int(end_str[:2])
424
- minute = int(end_str[2:])
425
- # Use same AM/PM logic as start time
426
- if start_time.hour >= 12 and hour < 12: # Start is PM, end should be PM too
427
- hour += 12
428
- end_time = time(hour, minute)
429
- else: # Single or double digit - treat as hour
430
- end_time = self.parse_time(end_str)
431
-
432
- # Additional validation: ensure end time is after start time
433
- if end_time and start_time:
434
- # Convert to minutes for easy comparison
435
- start_minutes = start_time.hour * 60 + start_time.minute
436
- end_minutes = end_time.hour * 60 + end_time.minute
437
-
438
- # If end time is before or equal to start time, try to fix it
439
- if end_minutes <= start_minutes:
440
- # Try adding 12 hours to end time if it's in AM and start is in PM
441
- if end_time.hour < 12 and start_time.hour >= 12:
442
- fixed_end_time = time(end_time.hour + 12, end_time.minute)
443
- return start_time, fixed_end_time
444
- # Try adding 12 hours to end time if both are in AM but end should be PM
445
- elif end_time.hour < 12 and start_time.hour < 12:
446
- fixed_end_time = time(end_time.hour + 12, end_time.minute)
447
- return start_time, fixed_end_time
448
- # If still problematic, return None for end_time (will default to 1 hour duration)
449
- else:
450
- return start_time, None
451
-
452
- return start_time, end_time
453
-
454
- return None, None
455
-
456
-
457
- def parse_date_range(
458
- self, date_str: str, month: int, year: int
459
- ) -> Tuple[Optional[date], Optional[date]]:
460
- """
461
- Parse date or date range string, including cross-month ranges
462
-
463
- Args:
464
- date_str: Date string (e.g., "15", "22-23", "8-11", "21st", "22nd-24th", "25-July 4")
465
- month: Current month number
466
- year: Current year
467
-
468
- Returns:
469
- Tuple of (start_date, end_date). end_date is None for single dates
470
- """
471
- date_str = date_str.strip()
472
-
473
- # Remove ordinal suffixes (st, nd, rd, th)
474
- def clean_ordinal(day_str: str) -> str:
475
- """Remove ordinal suffixes from day string"""
476
- day_str = day_str.strip()
477
- # Match ordinal suffixes: 1st, 2nd, 3rd, 4th, 11th, 21st, etc.
478
- ordinal_pattern = r"^(\d+)(?:st|nd|rd|th)$"
479
- match = re.match(ordinal_pattern, day_str, re.IGNORECASE)
480
- if match:
481
- return match.group(1)
482
- return day_str
483
-
484
- # Handle date ranges (including cross-month like "25-July 4")
485
- if "-" in date_str:
486
- parts = date_str.split("-", 1) # Split only on first dash
487
- if len(parts) == 2:
488
- start_part = parts[0].strip()
489
- end_part = parts[1].strip()
490
-
491
- try:
492
- # Parse start date (always in current month)
493
- start_day = int(clean_ordinal(start_part))
494
- start_date = date(year, month, start_day)
495
-
496
- # Check if end part contains a month name (cross-month range)
497
- # Pattern like "July 4" or "July4" or "July-4"
498
- month_day_pattern = r"^(\w+)\s*-?\s*(\d+)(?:st|nd|rd|th)?$"
499
- cross_month_match = re.match(month_day_pattern, end_part, re.IGNORECASE)
500
-
501
- if cross_month_match:
502
- # Cross-month range like "25-July 4"
503
- end_month_name = cross_month_match.group(1).lower()
504
- end_day_str = cross_month_match.group(2)
505
-
506
- if end_month_name in self.MONTHS:
507
- end_month = self.MONTHS[end_month_name]
508
- end_day = int(clean_ordinal(end_day_str))
509
-
510
- # Handle year transition if end month is earlier than start month
511
- end_year = year
512
- if end_month < month:
513
- end_year += 1
514
-
515
- end_date = date(end_year, end_month, end_day)
516
- return start_date, end_date
517
- else:
518
- # Same-month range like "22-23"
519
- end_day = int(clean_ordinal(end_part))
520
- end_date = date(year, month, end_day)
521
-
522
- # Validate that end date is after start date
523
- if end_date <= start_date:
524
- # If end date is before or equal to start date, assume it's next month
525
- # Handle month rollover
526
- next_month = month + 1
527
- next_year = year
528
- if next_month > 12:
529
- next_month = 1
530
- next_year += 1
531
-
532
- try:
533
- end_date = date(next_year, next_month, end_day)
534
- except ValueError:
535
- # If the day doesn't exist in next month, skip this range
536
- return start_date, None
537
-
538
- return start_date, end_date
539
-
540
- except (ValueError, TypeError):
541
- pass
542
-
543
- # Single date (e.g., "15", "21st", "22nd")
544
- try:
545
- day = int(clean_ordinal(date_str))
546
- return date(year, month, day), None
547
- except (ValueError, TypeError):
548
- pass
549
-
550
- return None, None
551
-
552
- def clean_event_line(self, line: str) -> str:
553
- """
554
- Clean special characters and extra whitespace from event lines before parsing
555
-
556
- Args:
557
- line: Raw event line that may contain special characters
558
-
559
- Returns:
560
- Cleaned line ready for parsing
561
- """
562
- if not line:
563
- return line
564
-
565
- # Remove common special characters that interfere with parsing
566
- # Keep important characters like hyphens (for date ranges), colons (for times),
567
- # parentheses (for notes), and basic punctuation
568
- # IMPORTANT: Removed colon (:) from removal pattern to preserve time parsing
569
- special_chars_to_remove = r'[><!@#$%^&*_+=\[\]{}\\|;"\'`~]'
570
-
571
- # Replace special characters with spaces, then clean up multiple spaces
572
- cleaned = re.sub(special_chars_to_remove, ' ', line)
573
-
574
- # Clean up multiple whitespace characters
575
- cleaned = re.sub(r'\s+', ' ', cleaned)
576
-
577
- # Strip leading/trailing whitespace
578
- cleaned = cleaned.strip()
579
-
580
- return cleaned
581
-
582
- def parse_event_line(
583
- self,
584
- line: str,
585
- current_month: int,
586
- current_year: int,
587
- last_event_date: Optional[date] = None,
588
- email: EMail = None,
589
- ) -> List[ParsedEvent]:
590
- """
591
- Parse a single line that may contain one or more events
592
-
593
- Args:
594
- line: Line of text containing event information
595
- current_month: Current month number
596
- current_year: Current year
597
- last_event_date: Date from the previous event (used when line doesn't start with a date)
598
- email: EMail object for linking events
599
-
600
- Returns:
601
- List of parsed events
602
- """
603
- events = []
604
- line = line.strip()
605
-
606
- if not line:
607
- return events
608
-
609
- # Clean special characters and extra whitespace before parsing
610
- line = self.clean_event_line(line)
611
-
612
- if not line: # Check again after cleaning
613
- return events
614
-
615
- # Check for tentative events (containing "or")
616
- is_tentative = " or " in line.lower()
617
-
618
- # Split on & and "and" for multiple events on same line
619
- event_parts = re.split(r"\s*&\s*|\s+\s+", line, flags=re.IGNORECASE) # and
620
-
621
- for event_part in event_parts:
622
- event_part = event_part.strip()
623
- if not event_part:
624
- continue
625
-
626
- # Look for date patterns at the start (including cross-month patterns like "25-July 4")
627
- # Updated regex to handle cross-month patterns with month names
628
- date_time_pattern = r"^(\d+(?:st|nd|rd|th)?(?:-(?:\d+(?:st|nd|rd|th)?|\w+\s*\d+(?:st|nd|rd|th)?))?)(?:\s+or\s+\d+(?:st|nd|rd|th)?(?:-(?:\d+(?:st|nd|rd|th)?|\w+\s*\d+(?:st|nd|rd|th)?))?)?\s*(.*)$"
629
- match = re.match(date_time_pattern, event_part, re.IGNORECASE)
630
-
631
- start_date = None
632
- end_date = None
633
- rest = event_part
634
-
635
- if match:
636
- # Event starts with a date
637
- date_part, rest = match.groups()
638
-
639
- # Handle tentative dates with "or"
640
- if " or " in date_part.lower():
641
- date_options = re.split(r"\s+or\s+", date_part, flags=re.IGNORECASE)
642
- date_part = date_options[0] # Use first option for now
643
- is_tentative = True
644
-
645
- # Parse the date range
646
- start_date, end_date = self.parse_date_range(
647
- date_part, current_month, current_year
648
- )
649
- else:
650
- # Event doesn't start with a date, use last event's date if available
651
- if last_event_date:
652
- start_date = last_event_date
653
- # rest is the entire event_part since there's no date to strip
654
- rest = event_part
655
- else:
656
- # No date found and no previous date to use, skip this event part
657
- continue
658
-
659
- if not start_date:
660
- continue
661
-
662
- # Extract times from anywhere in the rest of the text
663
- start_time = None
664
- end_time = None
665
- summary = rest.strip()
666
-
667
- # Look for time patterns throughout the text
668
- time_patterns = [
669
- r"\b(\d{1,2}):(\d{2})\s*(am|pm)\b", # 2:30pm, 10:15am
670
- r"\b(\d{1,2})\s*(am|pm)\b", # 2pm, 10am
671
- r"\b(\d{1,2}):(\d{2})\b", # 2:30, 14:15 (without am/pm)
672
- r"\b(\d{3,4})\s*(am|pm)\b", # 830am, 1015am, 1020am (WITH am/pm)
673
- r"\b(\d{3,4})\b(?!\s*(?:am|pm|ish))", # 830, 1015 (without am/pm, not followed by am/pm/ish)
674
- r"\b(\d{3,4})ish\b", # 830ish, 1020ish
675
- r"\b(noon|midnight)\b", # noon, midnight
676
- r"\b(\d{1,2})-(\d{1,4})\b", # Time ranges like "2-245" (2:00-2:45) or "9-10" (9:00-10:00)
677
- r"\b(\d{1,2})\b(?!\s*(?:am|pm|ish|\d))", # Single digit hours like "10" (but not followed by am/pm/ish/digits)
678
- ]
679
-
680
- found_times = []
681
- found_time_ranges = []
682
-
683
- for pattern in time_patterns:
684
- matches = re.finditer(pattern, rest, re.IGNORECASE)
685
- for match in matches:
686
- time_str = match.group(0)
687
-
688
- # Check if this is a time range pattern
689
- if re.match(r"\b(\d{1,2})-(\d{1,4})\b", time_str):
690
- # Parse as time range
691
- start_time_parsed, end_time_parsed = self.parse_time_range(time_str)
692
- if start_time_parsed:
693
- found_time_ranges.append((start_time_parsed, end_time_parsed, match.span()))
694
- else:
695
- # Parse as single time
696
- parsed_time = self.parse_time(time_str)
697
- if parsed_time:
698
- # Store both the original string and parsed time for proper removal
699
- found_times.append((parsed_time, match.span(), time_str))
700
-
701
- # Handle time extraction - prioritize time ranges over individual times
702
- if found_time_ranges:
703
- # Use the first time range found
704
- start_time, end_time = found_time_ranges[0][0], found_time_ranges[0][1]
705
- # Remove the time range from summary
706
- time_range_span = found_time_ranges[0][2]
707
- summary = summary[:time_range_span[0]] + summary[time_range_span[1]:]
708
-
709
- elif found_times:
710
- # Sort by position in text
711
- found_times.sort(key=lambda x: x[1][0])
712
-
713
- # Take first time as start time
714
- start_time = found_times[0][0]
715
-
716
- # If there are two times, second one is end time
717
- if len(found_times) >= 2:
718
- end_time = found_times[1][0]
719
-
720
- # Remove time strings from summary using safer string replacement
721
- # Only remove unique time strings to avoid removing the same pattern multiple times
722
- unique_time_strings = []
723
- for time_info in found_times:
724
- time_str = time_info[2] # Now the original string is at index 2
725
- if time_str not in unique_time_strings:
726
- unique_time_strings.append(time_str)
727
-
728
- # Remove each unique time string once
729
- for time_str in unique_time_strings:
730
- summary = summary.replace(time_str, ' ', 1)
731
-
732
- # Clean up extra spaces in summary
733
- summary = re.sub(r"\s+", " ", summary).strip()
734
-
735
- # Determine if it's an all-day event
736
- is_all_day = start_time is None and end_time is None
737
-
738
- # Clean summary and check if it's valid
739
- clean_summary = self.strip_formatting(summary)
740
-
741
- # Skip events with empty or formatting-only summaries
742
- if not clean_summary or clean_summary in ['**', '*', '__', '_', '***', '___', '>', '>>']:
743
- continue
744
-
745
- # Additional check: Skip if the clean summary is just a month name
746
- # This prevents month names from being treated as events
747
- if clean_summary.lower() in self.MONTHS:
748
- continue
749
-
750
- # Create the event
751
- event = ParsedEvent(
752
- start_date=start_date,
753
- end_date=end_date,
754
- start_time=start_time,
755
- end_time=end_time,
756
- summary=clean_summary, # Use cleaned summary
757
- is_all_day=is_all_day,
758
- is_tentative=is_tentative,
759
- email=email,
760
- )
761
-
762
- events.append(event)
763
-
764
- return events
765
-
766
- def parse_email_body(self, email_body: str, email: EMail) -> List[ParsedEvent]:
767
- """
768
- Parse email body and extract all calendar events
769
-
770
- Args:
771
- email_body: Raw email body (HTML or plain text)
772
- email: EMail object for linking events
773
-
774
- Returns:
775
- List of parsed calendar events
776
- """
777
- # Clean HTML content
778
- if "<" in email_body and ">" in email_body:
779
- text_content = self.clean_html_content(email_body)
780
- else:
781
- text_content = email_body
782
-
783
- lines = text_content.split("\n")
784
- events = []
785
- current_year = self.current_year
786
- current_month = None
787
- last_event_date = None # Track the last parsed date
788
-
789
- for line in lines:
790
- line = line.strip()
791
- if not line:
792
- continue
793
-
794
- # Check for year
795
- year_match = re.match(r"^\s*(\d{4})\s*$", line)
796
- if year_match:
797
- current_year = int(year_match.group(1))
798
- continue
799
-
800
- # Check for month - strip formatting first and be more restrictive
801
- # Only match if it's ONLY a month name with optional formatting, no other content
802
- month_match = re.match(r"^\s*([*_#]+)?(\w+)([*_#]+)?\s*$", line, re.IGNORECASE)
803
- if month_match:
804
- # Extract the month name without formatting
805
- month_name = month_match.group(2).lower()
806
- if month_name in self.MONTHS:
807
- # Additional check: make sure this isn't part of a larger event description
808
- # Skip if the line contains numbers, which likely indicates it's an event
809
- if not re.search(r'\d', line):
810
- new_month = self.MONTHS[month_name]
811
-
812
- # Handle year increment when months loop (Dec -> Jan)
813
- if current_month and new_month < current_month:
814
- # Only increment year if we go from a late month to early month
815
- if (
816
- current_month >= 10 and new_month <= 3
817
- ): # Oct/Nov/Dec -> Jan/Feb/Mar
818
- current_year += 1
819
-
820
- current_month = new_month
821
- continue
822
-
823
- # Parse event line if we have a current month
824
- if current_month:
825
- line_events = self.parse_event_line(
826
- line, current_month, current_year, last_event_date, email
827
- )
828
- events.extend(line_events)
829
-
830
- # Update last_event_date with the last parsed event's date
831
- if line_events:
832
- last_event_date = line_events[-1].start_date
833
-
834
- return events
835
-
836
-
837
- def parse_email_events(email: EMail) -> List[Event]:
838
- """
839
- Convenience function to parse email events
840
-
841
- Args:
842
- email: EMail object containing body and delivery_date
843
-
844
- Returns:
845
- List of Event model instances
846
- """
847
- parser = EmailEventParser(email.delivery_date)
848
- parsed_events = parser.parse_email_body(email.body, email)
849
-
850
- # Convert ParsedEvent objects to Event objects
851
- events = []
852
- for parsed_event in parsed_events:
853
- event = parsed_event.to_event()
854
- events.append(event)
855
-
856
- return events