epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. epstein_files/__init__.py +31 -18
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +225 -136
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +138 -163
  7. epstein_files/documents/emails/email_header.py +21 -11
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +48 -44
  13. epstein_files/epstein_files.py +54 -33
  14. epstein_files/person.py +142 -110
  15. epstein_files/util/constant/names.py +29 -6
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +12 -6
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +101 -174
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +20 -15
  22. epstein_files/util/env.py +24 -16
  23. epstein_files/util/file_helper.py +28 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +57 -16
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +33 -10
  30. epstein_files/util/rich.py +28 -2
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/METADATA +14 -1
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. epstein_files-1.4.1.dist-info/RECORD +0 -34
  35. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  36. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
  37. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,223 @@
1
+ """
2
+ Regexes and patterns for identifying people in email headers.
3
+ """
4
+ import re
5
+ from copy import deepcopy
6
+
7
+ from epstein_files.util.constant.names import *
8
+ from epstein_files.util.constant.strings import REDACTED
9
+ from epstein_files.util.data import escape_single_quotes
10
+ from epstein_files.util.logging import logger
11
+
12
+ BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)|afiaata|[IM]{4,}).*)$', re.IGNORECASE)
13
+ BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•=()]")
14
+ TIME_REGEX = re.compile(r'^((\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday)|\d{4} ).*')
15
+
16
+ EMAILER_ID_PATTERNS: dict[str, str] = {
17
+ ALAN_DERSHOWITZ: r'(alan.{1,7})?dershowi(lz?|t?z)|AlanDersh',
18
+ ALIREZA_ITTIHADIEH: r'Alireza.[Il]ttihadieh',
19
+ ALISON_J_NATHAN: r"Alison(\s*J\.?)?\s*Nathan|Nathan NYSD Chambers?",
20
+ AMANDA_ENS: r'ens, amanda?|Amanda.Ens',
21
+ AMIR_TAAKI: r'Amir\s*Taaki|genjix',
22
+ ANAS_ALRASHEED: r'anas\s*al\s*rashee[cd]',
23
+ ANIL_AMBANI: r'Anil.Ambani',
24
+ ANN_MARIE_VILLAFANA: r'Villafana, Ann Marie|(A(\.|nn) Marie )?Villafa(c|n|ri)a',
25
+ ANTHONY_SCARAMUCCI: r"mooch|(Anthony ('The Mooch' )?)?Scaramucci",
26
+ ARIANE_DE_ROTHSCHILD: r'AdeR|((Ariane|Edmond) (de )?)?Rothsh?ch?ild|Ariane(?!\s+Dwyer)',
27
+ BARBRO_C_EHNBOM: r'behnbom@aol.com|(Barbro\s.*)?Ehnbom',
28
+ BARRY_J_COHEN: r'barry\s*((j.?|james)\s*)?cohen?',
29
+ BENNET_MOSKOWITZ: r'Moskowitz.*Bennet|Bennet.*Moskowitz',
30
+ BOB_CROWE: r"[BR]ob Crowe",
31
+ BORIS_NIKOLIC: r'(boris )?nikolic?',
32
+ BRAD_EDWARDS: r'Brad(ley)?(\s*J(.?|ames))?\s*Edwards',
33
+ BRAD_KARP: r'Brad (S.? )?Karp|Karp, Brad',
34
+ CHRISTIAN_EVERDELL: r"C(hristian\s*)?Everdell?",
35
+ CHRISTOPHER_DILORIO: r"Chris\s*Di[lI]o[nr](io)?",
36
+ DANGENE_AND_JENNIE_ENTERPRISE: r'Dangene and Jennie Enterprise?',
37
+ DANNY_FROST: r'Frost, Danny|frostd@dany.nyc.gov|Danny\s*Frost',
38
+ DARREN_INDYKE: r'darren$|Darren\s*(K\.?\s*)?[il]n[dq]_?yke?|dkiesq',
39
+ DAVID_FISZEL: r'David\s*Fis?zel',
40
+ DAVID_HAIG: fr'{DAVID_HAIG}|Haig, David',
41
+ DAVID_STERN: r'David Stern?',
42
+ DOUGLAS_WIGDOR: r'Doug(las)?\s*(H\.?)?\s*Wigdor',
43
+ EDUARDO_ROBLES: r'Ed(uardo)?\s*Robles',
44
+ EDWARD_JAY_EPSTEIN: r'(?<!Jeffrey )Edward (Jay )?Epstein',
45
+ EHUD_BARAK: r'(ehud|e?h)\s*barak|\behud',
46
+ FAITH_KATES: r'faith kates?',
47
+ GERALD_BARTON: r'Gerald.*Barton',
48
+ GERALD_LEFCOURT: r'Gerald\s*(B\.?\s*)?Lefcourt',
49
+ GHISLAINE_MAXWELL: r'g ?max(well)?|Ghislaine|Maxwell',
50
+ HEATHER_MANN: r'Heather Mann?',
51
+ INTELLIGENCE_SQUARED: r'intelligence\s*squared',
52
+ JACKIE_PERCZEK: r'jackie percze[kl]?',
53
+ JABOR_Y: r'[ji]abor\s*y?',
54
+ JAMES_HILL: r"hill, james e.|james.e.hill@abc.com",
55
+ JANUSZ_BANASIAK: r"Janu[is]z Banasiak",
56
+ JEAN_HUGUEN: r"Jean[\s.]Huguen",
57
+ JEAN_LUC_BRUNEL: r'Jean[- ]Luc Brunel?|JeanLuc',
58
+ JEFF_FULLER: r"jeff@mc2mm.com|Jeff Fuller",
59
+ JEFFREY_EPSTEIN: r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeff(rey)? (Edward )?E((sp|ps)tein?)?( VI Foundation)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!(Mark L.|ard Jay) )Epstein',
60
+ JESSICA_CADWELL: r'Jessica Cadwell?',
61
+ JOHNNY_EL_HACHEM: r'el hachem johnny|johnny el hachem',
62
+ JOI_ITO: r'ji@media.mit.?edu|(joichi|joi)( Ito)?',
63
+ JONATHAN_FARKAS: r'Jonathan Fark(a|u)(s|il)',
64
+ KARYNA_SHULIAK: r"Karyna\s*Shuliak?",
65
+ KATHRYN_RUEMMLER: r'Kathr?yn? Ruemmler?',
66
+ KEN_STARR: r'starr, ken|Ken(neth\s*(W.\s*)?)?\s+starr?|starr',
67
+ LANDON_THOMAS: r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]',
68
+ LARRY_SUMMERS: r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|[Il]hsofficel?',
69
+ LAWRANCE_VISOSKI: r'La(rry|wrance) Visoski?|Lvjet',
70
+ LAWRENCE_KRAUSS: r'Lawrence Kraus[es]?|[jl]awkrauss|kruase',
71
+ LEON_BLACK: r'Leon\s*Black?|(?<!Marc )Leon(?! (Botstein|Jaworski|Wieseltier))',
72
+ LILLY_SANCHEZ: r'Lilly.*Sanchez',
73
+ LISA_NEW: r'E?Lisa New?\b',
74
+ MANUELA_MARTINEZ: fr'Manuela (- Mega Partners|Martinez)',
75
+ MARIANA_IDZKOWSKA: r'Mariana [Il]d[źi]kowska?',
76
+ MARK_EPSTEIN: r'Mark (L\. )?(Epstein|Lloyd)',
77
+ MARC_LEON: r'Marc[.\s]+(Kensington|Leon)|Kensington2',
78
+ MARTIN_NOWAK: r'(Martin.*?)?No[vw]ak|Nowak, Martin',
79
+ MARTIN_WEINBERG: r'martin.*?weinberg',
80
+ "Matthew Schafer": r"matthew\.?schafer?",
81
+ MELANIE_SPINELLA: r'M?elanie Spine[Il]{2}a',
82
+ MICHAEL_BUCHHOLTZ: r'Michael.*Buchholtz',
83
+ MICHAEL_MILLER: r'Micha(el)? Miller|Miller, Micha(el)?',
84
+ MICHAEL_SITRICK: r'(Mi(chael|ke).{0,5})?[CS]itrick',
85
+ MICHAEL_WOLFF: r'Michael\s*Wol(f[ef]e?|i)|Wolff',
86
+ MIROSLAV_LAJCAK: r"Miro(slav)?(\s+Laj[cč][aá]k)?",
87
+ MOHAMED_WAHEED_HASSAN: r'Mohamed Waheed(\s+Hassan)?',
88
+ NADIA_MARCINKO: r"Na[dď]i?a\s+Marcinko(v[aá])?",
89
+ NEAL_KASSELL: r'Neal\s*Kassell?',
90
+ NICHOLAS_RIBIS: r'Nic(holas|k)[\s._]Ribi?s?|Ribbis',
91
+ OLIVIER_COLOM: fr'Colom, Olivier|{OLIVIER_COLOM}',
92
+ PAUL_BARRETT: r'Paul Barre(d|tt)',
93
+ PAUL_KRASSNER: r'Pa\s?ul Krassner',
94
+ PAUL_MORRIS: r'morris, paul|Paul Morris',
95
+ PAULA: r'^Paula( Heil Fisher)?$',
96
+ PEGGY_SIEGAL: r'Peggy Siegal?',
97
+ PETER_ATTIA: r'Peter Attia?',
98
+ PETER_MANDELSON: r"((Lord|Peter) )?Mandelson",
99
+ 'pink@mc2mm.com': r"^Pink$|pink@mc2mm\.com",
100
+ PRINCE_ANDREW: r'Prince Andrew|The Duke',
101
+ REID_WEINGARTEN: r'Weingarten, Rei[cdi]|Rei[cdi] Weingarten',
102
+ RICHARD_KAHN: r'rich(ard)? kahn?',
103
+ ROBERT_D_CRITTON_JR: r'Robert D.? Critton,? Jr.?',
104
+ ROBERT_LAWRENCE_KUHN: r'Robert\s*(Lawrence)?\s*Kuhn',
105
+ ROBERT_TRIVERS: r'tri[vy]ersr@gmail|Robert\s*Trivers?',
106
+ ROSS_GOW: fr"Ross(acuity)? Gow|(ross@)?acuity\s*reputation(\.com)?",
107
+ SAMUEL_LEFF: r"Sam(uel)?(/Walli)? Leff",
108
+ SCOTT_J_LINK: r'scott j. link?',
109
+ SEAN_BANNON: r'sean bannon?',
110
+ SHAHER_ABDULHAK_BESHER: r'\bShaher( Abdulhak Besher)?\b',
111
+ SOON_YI_PREVIN: r'Soon[- ]Yi Previn?',
112
+ STACEY_RICHMAN: r"srichmanlaw|Stacey\s*Richman",
113
+ STEPHEN_HANSON: r'ste(phen|ve) hanson?|Shanson900',
114
+ STEVE_BANNON: r'steve banno[nr]?',
115
+ STEVEN_SINOFSKY: r'Steven Sinofsky?',
116
+ SULTAN_BIN_SULAYEM: r'Sultan (Ahmed )?bin Sulaye?m?',
117
+ TERJE_ROD_LARSEN: r"Terje(( (R[øo]e?d[- ])?)?Lars[eo]n)?",
118
+ TERRY_KAFKA: r'Terry Kafka?',
119
+ THANU_BOONYAWATANA: r"Thanu (BOONYAWATANA|Cnx)",
120
+ THORBJORN_JAGLAND: r'(Thor.{3,8})?Jag[il]and?',
121
+ TONJA_HADDAD_COLEMAN: r"To(nj|rl)a Haddad Coleman|haddadfm@aol.com",
122
+ VINCENZO_IOZZO: r"Vincenzo [IL]ozzo",
123
+ }
124
+
125
+ # If found as substring consider them the author
126
+ EMAILERS = [
127
+ 'Anne Boyles',
128
+ AL_SECKEL,
129
+ 'Ariane Dwyer',
130
+ AZIZA_ALAHMADI,
131
+ BILL_GATES,
132
+ BILL_SIEGEL,
133
+ 'Bobbi C Sternheim',
134
+ BRAD_WECHSLER,
135
+ BROCK_PIERCE,
136
+ CHRISTINA_GALBRAITH,
137
+ DANIEL_SABBA,
138
+ 'Danny Goldberg',
139
+ DAVID_SCHOEN,
140
+ DEBBIE_FEIN,
141
+ DEEPAK_CHOPRA,
142
+ GLENN_DUBIN,
143
+ GORDON_GETTY,
144
+ 'Jeff Pagliuca',
145
+ 'Kevin Bright',
146
+ 'Jack Lang',
147
+ JACK_SCAROLA,
148
+ JAY_LEFKOWITZ,
149
+ JES_STALEY,
150
+ JOHN_PAGE,
151
+ 'Jokeland',
152
+ JOSCHA_BACH,
153
+ 'Kathleen Ruderman',
154
+ KENNETH_E_MAPP,
155
+ 'Larry Cohen',
156
+ LESLEY_GROFF,
157
+ 'lorraine@mc2mm.com',
158
+ LINDA_STONE,
159
+ 'Lyn Fontanilla',
160
+ MARK_TRAMO,
161
+ MELANIE_WALKER,
162
+ MERWIN_DELA_CRUZ,
163
+ 'Michael Simmons', # Not the only "To:"
164
+ 'middle.east.update@hotmail.com',
165
+ 'Nancy Cain',
166
+ 'Nancy Dahl',
167
+ 'Nancy Portland',
168
+ 'Nathan NYSD Chambers',
169
+ 'Oliver Goodenough',
170
+ 'Paula Speer',
171
+ 'Peter Aldhous',
172
+ 'Peter Green',
173
+ ROGER_SCHANK,
174
+ 'Roy Black',
175
+ STEVEN_PFEIFFER,
176
+ 'Steven Victor MD',
177
+ 'Susan Edelman',
178
+ TOM_BARRACK,
179
+ 'USANYS',
180
+ 'Vahe Stepanian',
181
+ 'Vladimir Yudashkin',
182
+ ]
183
+
184
# Compile every hand-written identification pattern once, case-insensitively.
EMAILER_ID_REGEXES = {name: re.compile(pattern, re.IGNORECASE) for name, pattern in EMAILER_ID_PATTERNS.items()}

# EMAILER_ID_REGEXES stays free of the simple EMAILERS regexes; EMAILER_REGEXES is
# the copy that gets extended with them below.
EMAILER_REGEXES = deepcopy(EMAILER_ID_REGEXES)

# Add simple matching regexes for EMAILERS entries to EMAILER_REGEXES
for emailer in EMAILERS:
    if emailer in EMAILER_REGEXES:
        raise RuntimeError(f"Can't overwrite emailer regex for '{emailer}'")

    # EMAILERS entries are matched as literal substrings, so regex metacharacters
    # (e.g. the '.' in 'lorraine@mc2mm.com') are escaped before the trailing '?'
    # is appended. re.escape() emits at most a backslash plus one character per
    # input character, so the '?' still applies only to the final character.
    # NOTE(review): assumes the name constants imported from
    # epstein_files.util.constant.names are plain names, not regex source - confirm.
    EMAILER_REGEXES[emailer] = re.compile(re.escape(emailer) + '?', re.IGNORECASE)  # Last char optional bc OCR sucks
193
+
194
+ SUPPRESS_LOGS_FOR_AUTHORS = [
195
+ 'Multiple Senders Multiple Senders',
196
+ 'Undisclosed recipients:',
197
+ 'undisclosed-recipients:',
198
+ ]
199
+
200
+
201
def cleanup_str(_str: str) -> str:
    """Return `_str` with REDACTED markers and BAD_NAME_CHARS_REGEX characters removed.

    After the removals the result is stripped of surrounding whitespace, then of
    leading/trailing underscores, then of whitespace once more.
    """
    without_redactions = _str.replace(REDACTED, '')
    without_bad_chars = BAD_NAME_CHARS_REGEX.sub('', without_redactions)
    trimmed = without_bad_chars.strip()
    return trimmed.strip('_').strip()
203
+
204
+
205
def extract_emailer_names(emailer_str: str) -> list[str]:
    """Return a list of people's names found in `emailer_str` (email author or recipients field).

    The string is first normalized with `cleanup_str()`; an empty result yields `[]`.
    Every key of `EMAILER_REGEXES` whose regex is found anywhere in the cleaned
    string counts as a match.

    A cleaned string is treated as "semi-invalid" when it is at most 2 characters
    long or when `BAD_EMAILER_REGEX` / `TIME_REGEX` match at its start. In that case
    only the matched names are returned (possibly an empty list) and the string
    itself is never used as a name.

    Otherwise the matched names are returned, or the cleaned string itself when
    nothing matched, with each entry passed through `reverse_first_and_last_names()`.
    """
    emailer_str = cleanup_str(emailer_str)

    if not emailer_str:
        return []

    names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]

    is_semi_invalid = (
        len(emailer_str) <= 2
        or BAD_EMAILER_REGEX.match(emailer_str)
        or TIME_REGEX.match(emailer_str)
    )

    if is_semi_invalid:
        if names_found:
            logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
        elif emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
            # Strings in SUPPRESS_LOGS_FOR_AUTHORS are expected junk: log nothing at all for them
            # (previously they still fell through to an "Extracted 0 names" info message).
            logger.warning(f"No emailer found in '{escape_single_quotes(emailer_str)}'")

        return names_found

    # Nothing recognized in an otherwise plausible string: keep the cleaned string as the name.
    names_found = names_found or [emailer_str]
    return [reverse_first_and_last_names(name) for name in names_found]
@@ -6,13 +6,12 @@ from rich.text import Text
6
6
 
7
7
  from epstein_files.util.constant.names import ANTHONY_SCARAMUCCI, JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
8
8
  from epstein_files.util.constant.strings import TIMESTAMP_DIM
9
- from epstein_files.util.data import iso_timestamp
9
+ from epstein_files.util.data import AMERICAN_DATE_FORMAT, iso_timestamp
10
10
  from epstein_files.util.highlighted_group import get_style_for_name
11
11
  from epstein_files.util.logging import logger
12
12
  from epstein_files.util.rich import TEXT_LINK, highlighter
13
13
 
14
14
  EPSTEIN_TEXTERS = ['e:', 'e:jeeitunes@gmail.com']
15
- MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
16
15
  PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
17
16
  UNCERTAIN_SUFFIX = ' (?)'
18
17
 
@@ -54,7 +53,7 @@ class TextMessage:
54
53
  return self.text.startswith('http')
55
54
 
56
55
  def parse_timestamp(self) -> datetime:
57
- return datetime.strptime(self.timestamp_str, MSG_DATE_FORMAT)
56
+ return datetime.strptime(self.timestamp_str, AMERICAN_DATE_FORMAT)
58
57
 
59
58
  def timestamp_txt(self) -> Text:
60
59
  try:
@@ -28,6 +28,24 @@ class JsonFile(OtherFile):
28
28
  include_description_in_summary_panel: ClassVar[bool] = False
29
29
  strip_whitespace: ClassVar[bool] = False
30
30
 
31
+ @property
32
+ def category(self) -> str:
33
+ return JSON
34
+
35
+ @property
36
+ def is_interesting(self):
37
+ return False
38
+
39
+ @property
40
+ def info_txt(self) -> Text | None:
41
+ return Text(DESCRIPTION, style=INFO_STYLE)
42
+
43
+ @property
44
+ def metadata(self) -> Metadata:
45
+ metadata = super().metadata
46
+ metadata['description'] = DESCRIPTION
47
+ return metadata
48
+
31
49
  def __post_init__(self):
32
50
  super().__post_init__()
33
51
 
@@ -36,23 +54,9 @@ class JsonFile(OtherFile):
36
54
 
37
55
  self._set_computed_fields(text=self.json_str())
38
56
 
39
- def category(self) -> str:
40
- return JSON
41
-
42
- def info_txt(self) -> Text | None:
43
- return Text(DESCRIPTION, style=INFO_STYLE)
44
-
45
- def is_interesting(self):
46
- return False
47
-
48
57
  def json_data(self) -> object:
49
58
  with open(self.file_path, encoding='utf-8-sig') as f:
50
59
  return json.load(f)
51
60
 
52
- def metadata(self) -> Metadata:
53
- metadata = super().metadata()
54
- metadata['description'] = DESCRIPTION
55
- return metadata
56
-
57
61
  def json_str(self) -> str:
58
62
  return json.dumps(self.json_data(), indent=4)
@@ -31,38 +31,30 @@ class MessengerLog(Communication):
31
31
  messages: list[TextMessage] = field(default_factory=list)
32
32
  phone_number: str | None = None
33
33
 
34
- def __post_init__(self):
35
- super().__post_init__()
36
- self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
37
-
38
- def first_message_at(self, name: Name) -> datetime:
39
- return self.messages_by(name)[0].parse_timestamp()
34
+ @property
35
+ def border_style(self) -> str:
36
+ return self.author_style
40
37
 
38
+ @property
41
39
  def info_txt(self) -> Text | None:
42
40
  num_days_str = days_between_str(self.timestamp, self.messages[-1].parse_timestamp())
43
41
  txt = Text(f"(Covers {num_days_str} starting ", style='dim')
44
- txt.append(self.date_str(), style=TIMESTAMP_STYLE).append(' ')
42
+ txt.append(self.date_str, style=TIMESTAMP_STYLE).append(' ')
45
43
 
46
44
  if not self.author:
47
45
  txt.append('with unknown counterparty')
48
46
  else:
49
- txt.append(GUESSED_MSG if self.is_attribution_uncertain() else CONFIRMED_MSG).append(' ')
50
- txt.append(Text(self.author, style=self.author_style() + ' bold'))
47
+ txt.append(GUESSED_MSG if self.is_attribution_uncertain else CONFIRMED_MSG).append(' ')
48
+ txt.append(Text(self.author, style=self.author_style + ' bold'))
51
49
 
52
50
  if self.phone_number:
53
51
  txt.append(highlighter(f" using the phone number {self.phone_number}"))
54
52
 
55
53
  return txt.append(')')
56
54
 
57
- def last_message_at(self, name: Name) -> datetime:
58
- return self.messages_by(name)[-1].parse_timestamp()
59
-
60
- def messages_by(self, name: Name) -> list[TextMessage]:
61
- """Return all messages by 'name'."""
62
- return [m for m in self.messages if m.author == name]
63
-
55
+ @property
64
56
  def metadata(self) -> Metadata:
65
- metadata = super().metadata()
57
+ metadata = super().metadata
66
58
  metadata.update({'num_messages': len(self.messages)})
67
59
 
68
60
  if self.phone_number:
@@ -70,8 +62,19 @@ class MessengerLog(Communication):
70
62
 
71
63
  return metadata
72
64
 
73
- def _border_style(self) -> str:
74
- return self.author_style()
65
+ def __post_init__(self):
66
+ super().__post_init__()
67
+ self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
68
+
69
+ def first_message_at(self, name: Name) -> datetime:
70
+ return self.messages_by(name)[0].parse_timestamp()
71
+
72
+ def last_message_at(self, name: Name) -> datetime:
73
+ return self.messages_by(name)[-1].parse_timestamp()
74
+
75
+ def messages_by(self, name: Name) -> list[TextMessage]:
76
+ """Return all messages by 'name'."""
77
+ return [m for m in self.messages if m.author == name]
75
78
 
76
79
  def _build_message(self, match: re.Match) -> TextMessage:
77
80
  """Turn a regex match into a TextMessage."""
@@ -86,7 +89,7 @@ class MessengerLog(Communication):
86
89
  return TextMessage(
87
90
  author=self.author if (is_phone_number or not author_str) else author_str,
88
91
  author_str=author_str if is_phone_number else '', # Preserve phone numbers
89
- is_id_confirmed=not self.is_attribution_uncertain(),
92
+ is_id_confirmed=not self.is_attribution_uncertain,
90
93
  text=match.group(4).strip(),
91
94
  timestamp_str=match.group(2).strip(),
92
95
  )
@@ -102,25 +105,6 @@ class MessengerLog(Communication):
102
105
 
103
106
  raise RuntimeError(f"{self}: No timestamp found!")
104
107
 
105
- def _set_message_timestamps(self) -> None:
106
- raise NotImplementedError(f"TextMessage.timestamp no longer exists")
107
- last_message: TextMessage | None = None
108
-
109
- for i, message in enumerate(self.messages):
110
- try:
111
- message.timestamp = message.parse_timestamp()
112
- except Exception as e:
113
- msg = f"Failed to parse timestamp for TextMessage {i + 1}, {message}: {e}"
114
-
115
- if i == 0:
116
- message.timestamp = self.timestamp
117
- self.warn(f"{msg}\nit's the first message so using the MessengerLog timestamp property {self.timestamp}")
118
- else:
119
- message.timestamp = last_message.timestamp + timedelta(milliseconds=1)
120
- self.warn(f"{msg}\nadding 1 millisecond to last timestamp {last_message.timestamp}")
121
-
122
- last_message = message
123
-
124
108
  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
125
109
  yield self.file_info_panel()
126
110
  yield Text('')
@@ -30,7 +30,6 @@ MAX_DAYS_SPANNED_TO_BE_VALID = 10
30
30
  MAX_EXTRACTED_TIMESTAMPS = 100
31
31
  MIN_TIMESTAMP = datetime(2000, 1, 1)
32
32
  MID_TIMESTAMP = datetime(2007, 1, 1)
33
- MAX_TIMESTAMP = datetime(2022, 12, 31)
34
33
  PREVIEW_CHARS = int(580 * (1 if args.all_other_files else 1.5))
35
34
  LOG_INDENT = '\n '
36
35
  TIMESTAMP_LOG_INDENT = f'{LOG_INDENT} '
@@ -93,40 +92,28 @@ class OtherFile(Document):
93
92
  """
94
93
  was_timestamp_extracted: bool = False
95
94
  include_description_in_summary_panel: ClassVar[bool] = True # Class var for logging output
95
+ max_timestamp: ClassVar[datetime] = datetime(2022, 12, 31) # Overloaded in DojFile
96
96
 
97
- def __post_init__(self):
98
- super().__post_init__()
99
-
100
- if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
101
- self.log(f"Creating synthetic config for VI Daily News article...")
102
- self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
97
+ @property
98
+ def config_description(self) -> str | None:
99
+ """Overloads superclass property."""
100
+ if self.config and self.config.description:
101
+ return self.config.complete_description
103
102
 
103
+ @property
104
104
  def category(self) -> str | None:
105
105
  return self.config and self.config.category
106
106
 
107
+ @property
107
108
  def category_txt(self) -> Text | None:
108
- return styled_category(self.category())
109
-
110
- def config_description(self) -> str | None:
111
- """Overloads superclass method."""
112
- if self.config is not None:
113
- return self.config.complete_description()
114
-
115
- def highlighted_preview_text(self) -> Text:
116
- try:
117
- return highlighter(escape(self.preview_text()))
118
- except Exception as e:
119
- logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
120
- f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
121
- f"File: '{self.filename}'\n")
122
-
123
- return Text(escape(self.preview_text()))
109
+ return styled_category(self.category)
124
110
 
111
+ @property
125
112
  def is_interesting(self) -> bool:
126
113
  """Overloaded. False for lame prefixes, duplicates, and other boring files."""
127
- info_sentences = self.info()
114
+ info_sentences = self.info
128
115
 
129
- if self.is_duplicate():
116
+ if self.is_duplicate:
130
117
  return False
131
118
  elif len(info_sentences) == 0:
132
119
  return True
@@ -135,9 +122,9 @@ class OtherFile(Document):
135
122
  return self.config.is_interesting
136
123
  elif self.config.author in INTERESTING_AUTHORS:
137
124
  return True
138
- elif self.category() == FINANCE and self.author is not None:
125
+ elif self.category == FINANCE and self.author is not None:
139
126
  return False
140
- elif self.category() in UNINTERESTING_CATEGORIES:
127
+ elif self.category in UNINTERESTING_CATEGORIES:
141
128
  return False
142
129
 
143
130
  for prefix in UNINTERESTING_PREFIXES:
@@ -146,15 +133,33 @@ class OtherFile(Document):
146
133
 
147
134
  return True
148
135
 
136
+ @property
149
137
  def metadata(self) -> Metadata:
150
- metadata = super().metadata()
151
- metadata['is_interesting'] = self.is_interesting()
138
+ metadata = super().metadata
139
+ metadata['is_interesting'] = self.is_interesting
152
140
 
153
141
  if self.was_timestamp_extracted:
154
142
  metadata['was_timestamp_extracted'] = self.was_timestamp_extracted
155
143
 
156
144
  return metadata
157
145
 
146
+ def __post_init__(self):
147
+ super().__post_init__()
148
+
149
+ if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
150
+ self.log(f"Creating synthetic config for VI Daily News article...")
151
+ self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
152
+
153
+ def highlighted_preview_text(self) -> Text:
154
+ try:
155
+ return highlighter(escape(self.preview_text()))
156
+ except Exception as e:
157
+ logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
158
+ f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
159
+ f"File: '{self.filename}'\n")
160
+
161
+ return Text(escape(self.preview_text()))
162
+
158
163
  def preview_text(self) -> str:
159
164
  return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
160
165
 
@@ -164,9 +169,7 @@ class OtherFile(Document):
164
169
 
165
170
  def _extract_timestamp(self) -> datetime | None:
166
171
  """Return configured timestamp or value extracted by scanning text with datefinder."""
167
- if self.config and self.config.timestamp():
168
- return self.config.timestamp()
169
- elif self.config and any([s in (self.config_description() or '') for s in SKIP_TIMESTAMP_EXTRACT]):
172
+ if self.config and any([s in (self.config_description or '') for s in SKIP_TIMESTAMP_EXTRACT]):
170
173
  return None
171
174
 
172
175
  timestamps: list[datetime] = []
@@ -175,10 +178,11 @@ class OtherFile(Document):
175
178
  warnings.filterwarnings("ignore", module="dateutil")
176
179
 
177
180
  try:
178
- for timestamp in datefinder.find_dates(self.text, strict=True):
181
+ # TODO: datefinder.find_dates() cannot find 08/29/2019 style e.g. in EFTA00005783 :(
182
+ for timestamp in datefinder.find_dates(self.text, strict=False):
179
183
  timestamp = remove_timezone(timestamp)
180
184
 
181
- if MIN_TIMESTAMP < timestamp < MAX_TIMESTAMP:
185
+ if MIN_TIMESTAMP < timestamp < self.max_timestamp:
182
186
  timestamps.append(timestamp)
183
187
 
184
188
  if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
@@ -187,7 +191,7 @@ class OtherFile(Document):
187
191
  self.warn(f"Error while iterating through datefinder.find_dates(): {e}")
188
192
 
189
193
  if len(timestamps) == 0:
190
- if not (self.is_duplicate() or VAST_HOUSE in self.text):
194
+ if not (self.is_duplicate or VAST_HOUSE in self.text):
191
195
  self.log_top_lines(15, msg=f"No timestamps found")
192
196
 
193
197
  return None
@@ -222,21 +226,21 @@ class OtherFile(Document):
222
226
 
223
227
  for file in files:
224
228
  link_and_info = [file.external_links_txt()]
225
- date_str = file.date_str()
229
+ date_str = file.date_str
226
230
 
227
- if file.is_duplicate():
228
- preview_text = file.duplicate_file_txt()
231
+ if file.is_duplicate:
232
+ preview_text = file.duplicate_file_txt
229
233
  row_style = ' dim'
230
234
  else:
231
- link_and_info += file.info()
235
+ link_and_info += file.info
232
236
  preview_text = file.highlighted_preview_text()
233
237
  row_style = ''
234
238
 
235
239
  table.add_row(
236
240
  Group(*link_and_info),
237
241
  Text(date_str, style=TIMESTAMP_STYLE) if date_str else QUESTION_MARKS_TXT,
238
- file.file_size_str(),
239
- file.category_txt(),
242
+ file.file_size_str,
243
+ file.category_txt,
240
244
  preview_text,
241
245
  style=row_style
242
246
  )
@@ -246,12 +250,12 @@ class OtherFile(Document):
246
250
  @classmethod
247
251
  def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
248
252
  """Table showing file count by category."""
249
- categories = uniquify([f.category() for f in files])
250
- categories = sorted(categories, key=lambda c: -len([f for f in files if f.category() == c]))
253
+ categories = uniquify([f.category for f in files])
254
+ categories = sorted(categories, key=lambda c: -len([f for f in files if f.category == c]))
251
255
  table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
252
256
 
253
257
  for category in categories:
254
- category_files = [f for f in files if f.category() == category]
258
+ category_files = [f for f in files if f.category == category]
255
259
  table.add_row(styled_category(category), *cls.files_info_row(category_files))
256
260
 
257
261
  table.columns = table.columns[:-2] + [table.columns[-1]] # Removee unknown author col