epstein-files 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,167 @@
1
+ import json
2
+ import re
3
+ from dataclasses import asdict, dataclass, field
4
+
5
+ from epstein_files.util.constant.strings import AUTHOR, REDACTED
6
+ from epstein_files.util.constants import ALL_CONFIGS
7
+ from epstein_files.util.env import logger
8
+ from epstein_files.util.file_cfg import MessageCfg
9
+ from epstein_files.util.rich import UNKNOWN
10
+
11
# Header labels as they appear in the email text.
FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']
# EmailHeader attributes that are bookkeeping rather than values read from the email.
NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
ON_BEHALF_OF = 'on behalf of'
TO_FIELDS = ['bcc', 'cc', 'to']
EMAILER_FIELDS = [AUTHOR, *TO_FIELDS]

# Three or more consecutive lines that each start with a header label (or "on behalf of").
# 'Bee' is accepted alongside the Bcc spellings.
HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
EMAIL_SIMPLE_HEADER_REGEX = re.compile('^' + HEADER_REGEX_STR)
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
# Match up to the next email header section
EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, flags=re.DOTALL)
# Lines that begin with a numeric date or a weekday name.
TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')

BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")
# NOTE(review): only the first and last alternatives carry '^' / '$' anchors; the bare words in
# between (agreed, ok, rt, ...) match as prefixes when used with .match() - confirm that is intended.
BAD_EMAILER_REGEX = re.compile(
    r'^(>|11111111)|agreed|ok|sexy|rt|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$',
    flags=re.IGNORECASE,
)

# Every non-None 'actual_text' found on a MessageCfg in ALL_CONFIGS.
CONFIGURED_ACTUAL_TEXTS = [
    actual_text
    for actual_text in (cfg.actual_text for cfg in ALL_CONFIGS if isinstance(cfg, MessageCfg))
    if actual_text is not None
]
30
+
31
+
32
+ @dataclass(kw_only=True)
33
+ class EmailHeader:
34
+ field_names: list[str] # Order is same as the order header fields appear in the email file text
35
+ num_header_rows: int = field(init=False)
36
+ was_initially_empty: bool = False
37
+
38
+ # Fields from the email text
39
+ author: str | None = None
40
+ sent_at: str | None = None
41
+ subject: str | None = None
42
+ bcc: list[str] | None = None
43
+ cc: list[str] | None = None
44
+ importance: str | None = None
45
+ attachments: str | None = None
46
+ to: list[str] | None = None
47
+
48
+ def __post_init__(self):
49
+ self.num_header_rows = len(self.field_names)
50
+ self.was_initially_empty = self.is_empty()
51
+
52
+ def as_dict(self) -> dict[str, str | None]:
53
+ """Remove housekeeping fields that don't actually come from the email."""
54
+ return {k: v for k, v in asdict(self).items() if k not in NON_HEADER_FIELDS}
55
+
56
+ def is_empty(self) -> bool:
57
+ return not any([v for _k, v in self.as_dict().items()])
58
+
59
+ def recipients(self) -> list[str]:
60
+ return (self.to or []) + (self.cc or []) + (self.bcc or [])
61
+
62
+ def repair_empty_header(self, email_lines: list[str]) -> None:
63
+ num_headers = len(self.field_names)
64
+
65
+ # Sometimes the headers and values are on separate lines and we need to do some shenanigans
66
+ for i, field_name in enumerate(self.field_names):
67
+ row_number_to_check = i + num_headers # Look ahead 3 lines if there's 3 header fields, 4 if 4, etc.
68
+
69
+ if row_number_to_check > (len(email_lines) - 1):
70
+ raise RuntimeError(f"Ran out of header rows to check for '{field_name}'")
71
+
72
+ value = email_lines[row_number_to_check]
73
+ log_prefix = f"Looks like '{value}' is a mismatch for '{field_name}', "
74
+
75
+ if field_name == AUTHOR:
76
+ if value in CONFIGURED_ACTUAL_TEXTS:
77
+ logger.info(f"{log_prefix}, trying the next line...")
78
+ num_headers += 1
79
+ value = email_lines[i + num_headers]
80
+ elif TIME_REGEX.match(value) or value == 'Darren,' or BAD_EMAILER_REGEX.match(value):
81
+ logger.info(f"{log_prefix}, decrementing num_headers and skipping...")
82
+ num_headers -= 1
83
+ continue
84
+ elif field_name in TO_FIELDS:
85
+ if TIME_REGEX.match(value):
86
+ logger.info(f"{log_prefix}, trying next line...")
87
+ num_headers += 1
88
+ value = email_lines[i + num_headers]
89
+ elif BAD_EMAILER_REGEX.match(value):
90
+ logger.info(f"{log_prefix}, decrementing num_headers and skipping...")
91
+ num_headers -= 1
92
+ continue
93
+ elif value.startswith('http'):
94
+ logger.info(f"{log_prefix}, using empty string instead...")
95
+ value = ''
96
+
97
+ value = [v.strip() for v in value.split(';') if len(v.strip()) > 0]
98
+
99
+ setattr(self, field_name, value)
100
+
101
+ self.num_header_rows = len(self.field_names) + num_headers
102
+ logger.debug(f"Corrected empty header using {self.num_header_rows} lines to:\n%s\n\nTop lines:\n\n%s", self, '\n'.join(email_lines[0:(num_headers + 1) * 2]))
103
+
104
+ def rewrite_header(self) -> str:
105
+ header_fields = {}
106
+
107
+ for field_name in self.field_names:
108
+ if field_name == AUTHOR:
109
+ header_fields['From'] = self.author or ''
110
+ elif field_name == 'sent_at':
111
+ if self.sent_at in CONFIGURED_ACTUAL_TEXTS:
112
+ header_fields['Date'] = ''
113
+ else:
114
+ header_fields['Date'] = self.sent_at or ''
115
+ elif field_name in TO_FIELDS:
116
+ header_fields[field_name.title()] = '; '.join(getattr(self, field_name) or [])
117
+ else:
118
+ header_fields[field_name.title()] = getattr(self, field_name) or ''
119
+
120
+ return '\n'.join([f"{k}: {v}" for k, v in header_fields.items()])
121
+
122
+ def __str__(self) -> str:
123
+ return json.dumps(self.as_dict(), sort_keys=True, indent=4)
124
+
125
+ @classmethod
126
+ def from_header_lines(cls, header: str) -> 'EmailHeader':
127
+ kw_args = {}
128
+ field_names = []
129
+ should_log_header = False
130
+
131
+ for line in [l.strip() for l in header.strip().split('\n')]:
132
+ if line.lower().startswith(ON_BEHALF_OF):
133
+ author = line.removeprefix(ON_BEHALF_OF).strip()
134
+
135
+ if len(author) > 0:
136
+ kw_args[AUTHOR] = author
137
+
138
+ continue
139
+
140
+ #logger.debug(f"extracting header line: '{line}'")
141
+ key, value = [element.strip() for element in line.split(':', 1)]
142
+ value = value.rstrip('_')
143
+ key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower())
144
+ key = 'bcc' if key == 'bee' else key
145
+
146
+ if kw_args.get(key):
147
+ logger.debug(f'Already have value "{kw_args[key]}" at key "{key}", not overwriting with "{value}"')
148
+ should_log_header = True
149
+ continue
150
+
151
+ field_names.append(key)
152
+
153
+ if key in TO_FIELDS:
154
+ recipients = [element.strip() for element in value.split(';')]
155
+ recipients = [r for r in recipients if len(r) > 0]
156
+ kw_args[key] = None if len(value) == 0 else [r if len(r) > 0 else UNKNOWN for r in recipients]
157
+ else:
158
+ kw_args[key.lower()] = None if len(value) == 0 else value
159
+
160
+ if should_log_header:
161
+ logger.debug(f"Header being parsed was this:\n\n{header}\n")
162
+
163
+ return EmailHeader(field_names=field_names, **kw_args)
164
+
165
+ @staticmethod
166
+ def cleanup_str(_str: str) -> str:
167
+ return BAD_NAME_CHARS_REGEX.sub('', _str.replace(REDACTED, '')).strip().strip('_').strip()
@@ -0,0 +1,93 @@
1
+ import re
2
+ from dataclasses import dataclass, field
3
+ from datetime import datetime
4
+
5
+ from rich.text import Text
6
+
7
+ from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
8
+ from epstein_files.util.data import extract_last_name
9
+ from epstein_files.util.highlighted_group import get_style_for_name
10
+ from epstein_files.util.rich import TEXT_LINK, highlighter, logger
11
+
12
# strptime() format used to parse message timestamps.
MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
TIMESTAMP_STYLE = 'turquoise4 dim'

# Authors whose display string is reduced with extract_last_name().
DISPLAY_LAST_NAME_ONLY = [JEFFREY_EPSTEIN, STEVE_BANNON]

# Numbers whose messages are logged with a warning and left without an author.
UNKNOWN_TEXTERS = ['+16463880059', '+13108737937', '+13108802851']

# Raw sender strings mapped to the author name they are replaced with.
TEXTER_MAPPING = {
    'e:': JEFFREY_EPSTEIN,
    'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
    '+19174393646': ANTHONY_SCARAMUCCI,
    '+13109906526': STEVE_BANNON,
}
33
+
34
+
35
@dataclass(kw_only=True)
class TextMessage:
    """Class representing a single iMessage text message."""
    author: str | None
    author_str: str = field(init=False)  # Display form of the author, computed in __post_init__()
    id_confirmed: bool = False
    text: str
    timestamp_str: str

    def __post_init__(self):
        # Swap raw sender strings for a known name when TEXTER_MAPPING has one
        self.author = TEXTER_MAPPING.get(self.author or UNKNOWN, self.author)

        if self.author is None:
            self.author_str = UNKNOWN
        elif self.author in UNKNOWN_TEXTERS:
            logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
            self.author_str = self.author
            self.author = None  # TODO: this shouldn't be happening; we still know the author...
        elif self.author in DISPLAY_LAST_NAME_ONLY:
            self.author_str = extract_last_name(self.author)
        else:
            self.author_str = self.author

        # NOTE(review): this rebuilds author_str from the full author, so the last-name-only
        # shortening above is lost for unconfirmed messages - confirm whether that is intended.
        if not self.id_confirmed and self.author is not None:
            self.author_str = self.author + ' (?)'

    def timestamp(self) -> datetime:
        """Parse timestamp_str with MSG_DATE_FORMAT."""
        return datetime.strptime(self.timestamp_str, MSG_DATE_FORMAT)

    def _message(self) -> Text:
        """
        Build the renderable message body.

        Text that starts with 'http' is rendered as a link built from its first line (after
        re-joining a URL that was wrapped across lines), with any remaining lines appended
        below it. All other text has its newlines replaced by spaces and is passed to
        highlighter().
        """
        lines = self.text.split('\n')

        if not self.text.startswith('http'):
            return highlighter(' '.join(lines))  # remove newlines

        text = self.text

        # Fix multiline links
        if len(lines) > 1 and not lines[0].endswith('html'):
            if len(lines) > 2 and lines[1].endswith('-'):
                text = text.replace('\n', '', 2)
            else:
                text = text.replace('\n', '', 1)

        # The URL is the FIRST line; whatever follows is ordinary text shown under the link
        link_text, *trailing_lines = text.split('\n')
        msg_txt = Text('').append(Text.from_markup(f"[link={link_text}]{link_text}[/link]", style=TEXT_LINK))

        if len(trailing_lines) > 0:
            msg_txt.append('\n' + ' '.join(trailing_lines))

        return msg_txt

    def __rich__(self) -> Text:
        """Render as '[timestamp] author: message'."""
        # TODO: Workaround for phone numbers that sucks
        author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
        author_txt = Text(self.author_str, style=author_style)
        timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
        return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+ from rich.text import Text
5
+
6
+ from epstein_files.documents.other_file import OtherFile
7
+
8
+
9
@dataclass
class JsonFile(OtherFile):
    """An OtherFile whose contents are JSON data."""

    def __post_init__(self):
        super().__post_init__()

        # Drop a trailing .txt / .json extension from the slug
        if self.url_slug.endswith(('.txt', '.json')):
            self.url_slug = Path(self.url_slug).stem

    def info_txt(self) -> Text | None:
        """Fixed one-line note describing JSON files."""
        note = "JSON file, possibly iMessage or similar app metadata"
        return Text(note, style='white dim italic')

    def is_interesting(self):
        """JSON files are never flagged as interesting."""
        return False
@@ -0,0 +1,73 @@
1
+ import re
2
+ from dataclasses import dataclass, field
3
+ from datetime import datetime
4
+
5
+ from rich.console import Console, ConsoleOptions, RenderResult
6
+ from rich.text import Text
7
+
8
+ from epstein_files.documents.communication import Communication
9
+ from epstein_files.documents.imessage.text_message import MSG_DATE_FORMAT, TextMessage
10
+ from epstein_files.util.rich import logger
11
+
12
+ CONFIRMED_MSG = 'Found confirmed counterparty'
13
+ GUESSED_MSG = 'This is probably a conversation with'
14
+ MSG_REGEX = re.compile(r'Sender:(.*?)\nTime:(.*? (AM|PM)).*?Message:(.*?)\s*?((?=(\nSender)|\Z))', re.DOTALL)
15
+ REDACTED_AUTHOR_REGEX = re.compile(r"^([-+•_1MENO.=F]+|[4Ide])$")
16
+
17
+
18
+ @dataclass
19
+ class MessengerLog(Communication):
20
+ """Class representing one iMessage log file (one conversation between Epstein and some counterparty)."""
21
+ _messages: list[TextMessage] = field(default_factory=list)
22
+
23
+ def first_message_at(self, name: str | None) -> datetime:
24
+ return self.messages_by(name)[0].timestamp()
25
+
26
+ def info_txt(self) -> Text | None:
27
+ hint_msg = GUESSED_MSG if self.is_attribution_uncertain() else CONFIRMED_MSG
28
+ author_txt = Text(self.author_or_unknown(), style=self.author_style + ' bold')
29
+ return Text(f"({hint_msg} ", style='dim').append(author_txt).append(')')
30
+
31
+ def last_message_at(self, name: str | None) -> datetime:
32
+ return self.messages_by(name)[-1].timestamp()
33
+
34
+ def messages(self) -> list[TextMessage]:
35
+ """Lazily evaluated accessor for self._messages."""
36
+ if len(self._messages) == 0:
37
+ self._messages = [
38
+ TextMessage(
39
+ # If the Sender: is redacted that means it's from self.author
40
+ author=REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip()) or self.author,
41
+ id_confirmed=not self.is_attribution_uncertain(),
42
+ text=match.group(4).strip(),
43
+ timestamp_str=match.group(2).strip(),
44
+ )
45
+ for match in MSG_REGEX.finditer(self.text)
46
+ ]
47
+
48
+ return self._messages
49
+
50
+ def messages_by(self, name: str | None) -> list[TextMessage]:
51
+ """Return all messages by 'name'."""
52
+ return [m for m in self.messages() if m.author == name]
53
+
54
+ def _border_style(self) -> str:
55
+ return self.author_style
56
+
57
+ def _extract_timestamp(self) -> datetime:
58
+ for match in MSG_REGEX.finditer(self.text):
59
+ timestamp_str = match.group(2).strip()
60
+
61
+ try:
62
+ return datetime.strptime(timestamp_str, MSG_DATE_FORMAT)
63
+ except ValueError as e:
64
+ logger.info(f"[WARNING] Failed to parse '{timestamp_str}' to datetime! Using next match. Error: {e}'")
65
+
66
+ raise RuntimeError(f"{self}: No timestamp found!")
67
+
68
+ def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
69
+ yield self.file_info_panel()
70
+ yield Text('')
71
+
72
+ for message in self.messages():
73
+ yield message
@@ -0,0 +1,117 @@
1
+ import logging
2
+ import warnings
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+
6
+ import datefinder
7
+ import dateutil
8
+ from rich.markup import escape
9
+ from rich.panel import Panel
10
+ from rich.text import Text
11
+
12
+ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
13
+ from epstein_files.util.constants import UNINTERESTING_PREFIXES
14
+ from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
15
+ from epstein_files.util.env import args, logger
16
+ from epstein_files.util.rich import highlighter, logger
17
+
18
# Stop collecting timestamps from a document once this many have been found.
MAX_EXTRACTED_TIMESTAMPS = 100
# Extracted timestamps spanning more days than this get logged for review.
MAX_DAYS_SPANNED_TO_BE_VALID = 10

# Only timestamps strictly between MIN_TIMESTAMP and MAX_TIMESTAMP are kept.
MIN_TIMESTAMP = datetime(2000, 1, 1)
MID_TIMESTAMP = datetime(2007, 1, 1)
MAX_TIMESTAMP = datetime(2022, 12, 31)

# Preview length: 580 characters when args.all_other_files is set, otherwise 1.5x that.
PREVIEW_CHARS = 580 if args.all_other_files else int(580 * 1.5)

LOG_INDENT = '\n '
TIMESTAMP_LOG_INDENT = LOG_INDENT + ' '
VAST_HOUSE = 'vast house'  # Michael Wolff article draft about Epstein indicator
27
+
28
+
29
@dataclass
class OtherFile(Document):
    """File that is not an email, an iMessage log, or JSON data."""

    def configured_description(self) -> str | None:
        """Overrides superclass method: configured author and description joined by a space, or None."""
        if self.config is None:
            return None

        pieces = [p for p in [self.config.author, self.config.description] if p]
        return ' '.join(pieces) if pieces else None

    def description(self) -> Text:
        """One line summary mostly for logging."""
        return super().description().append(CLOSE_PROPERTIES_CHAR)

    def description_panel(self, include_hints=True) -> Panel:
        """Panelized description() with info_txt(), used in search results."""
        return super().description_panel(include_hints=include_hints)

    def highlighted_preview_text(self) -> Text:
        """
        Return the preview text run through highlighter().

        If highlighting raises, the failure is logged together with the exception and the
        escaped preview is returned as plain Text instead.
        """
        preview = self.preview_text()
        escaped_preview = escape(preview)

        try:
            return highlighter(escaped_preview)
        except Exception as e:
            # Best effort: a preview that cannot be highlighted is still worth showing
            logger.error(f"Failed to apply markup in string '{escape_single_quotes(escaped_preview)}'\n"
                         f"Original string: '{escape_single_quotes(preview)}'\n"
                         f"File: '{self.filename}'\n"
                         f"Error: {e}\n")

            return Text(escaped_preview)

    def is_interesting(self):
        """False for lame prefixes and duplicates."""
        hints = self.hints()

        if self.is_duplicate:
            return False
        elif len(hints) == 0:
            return True

        # Only the first hint is compared against the uninteresting prefixes
        for prefix in UNINTERESTING_PREFIXES:
            if hints[0].plain.startswith(prefix):
                return False

        return True

    def preview_text(self) -> str:
        """First PREVIEW_CHARS characters of the text after WHITESPACE_REGEX matches become single spaces."""
        return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]

    def _extract_timestamp(self) -> datetime | None:
        """Return configured timestamp or value extracted by scanning text with datefinder."""
        if self.config and self.config.timestamp:
            return self.config.timestamp

        timestamps: list[datetime] = []

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", module="datefinder")
            warnings.filterwarnings("ignore", module="dateutil")

            try:
                for timestamp in datefinder.find_dates(self.text, strict=True):
                    timestamp = remove_timezone(timestamp)

                    if MIN_TIMESTAMP < timestamp < MAX_TIMESTAMP:
                        timestamps.append(timestamp)

                    if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
                        break
            except ValueError as e:
                # Keep whatever was collected before the iteration failed
                logger.warning(f"Error while iterating through datefinder.find_dates(): {e}")

        if len(timestamps) == 0:
            self.log_top_lines(15, msg=f"{self.file_id}: No timestamps found", level=logging.INFO)
            return None
        elif len(timestamps) == 1:
            return timestamps[0]
        else:
            timestamps = sorted(uniquify(timestamps), reverse=True)
            self._log_extracted_timestamps_info(timestamps)
            return timestamps[0]  # Most recent timestamp appearing in text is usually the closest

    def _log_extracted_timestamps_info(self, timestamps: list[datetime]) -> None:
        """
        Log the extracted timestamps when they span more than MAX_DAYS_SPANNED_TO_BE_VALID days.

        'timestamps' must be sorted newest first. Nothing is logged for texts containing VAST_HOUSE.
        """
        num_days_spanned = (timestamps[0] - timestamps[-1]).days

        if num_days_spanned > MAX_DAYS_SPANNED_TO_BE_VALID and VAST_HOUSE not in self.text:
            timestamps_log_msg = f"Extracted {len(timestamps)} timestamps spanning {num_days_spanned} days{TIMESTAMP_LOG_INDENT}"
            timestamps_log_msg += TIMESTAMP_LOG_INDENT.join([str(dt) for dt in timestamps])
            self.log_top_lines(15, msg=timestamps_log_msg, level=logging.INFO)