epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +31 -18
- epstein_files/documents/communication.py +9 -5
- epstein_files/documents/document.py +225 -136
- epstein_files/documents/doj_file.py +242 -0
- epstein_files/documents/doj_files/full_text.py +166 -0
- epstein_files/documents/email.py +138 -163
- epstein_files/documents/emails/email_header.py +21 -11
- epstein_files/documents/emails/emailers.py +223 -0
- epstein_files/documents/imessage/text_message.py +2 -3
- epstein_files/documents/json_file.py +18 -14
- epstein_files/documents/messenger_log.py +23 -39
- epstein_files/documents/other_file.py +48 -44
- epstein_files/epstein_files.py +54 -33
- epstein_files/person.py +142 -110
- epstein_files/util/constant/names.py +29 -6
- epstein_files/util/constant/output_files.py +2 -0
- epstein_files/util/constant/strings.py +12 -6
- epstein_files/util/constant/urls.py +17 -0
- epstein_files/util/constants.py +101 -174
- epstein_files/util/data.py +2 -0
- epstein_files/util/doc_cfg.py +20 -15
- epstein_files/util/env.py +24 -16
- epstein_files/util/file_helper.py +28 -6
- epstein_files/util/helpers/debugging_helper.py +13 -0
- epstein_files/util/helpers/env_helpers.py +21 -0
- epstein_files/util/highlighted_group.py +57 -16
- epstein_files/util/layout/left_bar_panel.py +26 -0
- epstein_files/util/logging.py +28 -13
- epstein_files/util/output.py +33 -10
- epstein_files/util/rich.py +28 -2
- epstein_files/util/word_count.py +7 -7
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/METADATA +14 -1
- epstein_files-1.5.0.dist-info/RECORD +40 -0
- epstein_files-1.4.1.dist-info/RECORD +0 -34
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Regexes and patterns for identifying people in email headers.
|
|
3
|
+
"""
|
|
4
|
+
import re
|
|
5
|
+
from copy import deepcopy
|
|
6
|
+
|
|
7
|
+
from epstein_files.util.constant.names import *
|
|
8
|
+
from epstein_files.util.constant.strings import REDACTED
|
|
9
|
+
from epstein_files.util.data import escape_single_quotes
|
|
10
|
+
from epstein_files.util.logging import logger
|
|
11
|
+
|
|
12
|
+
# Header values that are garbage rather than a person's name: OCR noise ('11111111',
# runs of I/M chars), quoted-reply markers ('>'), subject-line fragments ('re:', 'fwd:'),
# and known junk phrases. A match means the string should not be treated as an emailer.
BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)|afiaata|[IM]{4,}).*)$', re.IGNORECASE)
# Characters that never belong in a person's name (quotes, brackets, OCR artifacts);
# stripped out by cleanup_str() below.
BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•=()]")
# Matches strings that begin with a date, weekday, or 4-digit year — i.e. a timestamp
# that leaked into the author/recipient field rather than an actual name.
TIME_REGEX = re.compile(r'^((\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday)|\d{4} ).*')
|
|
15
|
+
|
|
16
|
+
# Hand-written regex pattern per person, tolerant of OCR misreads (e.g. 'l' vs 'I',
# 'rn' vs 'm'), name variants, and known email addresses. Keys are canonical names
# (mostly constants from util.constant.names); values are raw regex strings that are
# compiled case-insensitively into EMAILER_ID_REGEXES below.
EMAILER_ID_PATTERNS: dict[str, str] = {
    ALAN_DERSHOWITZ: r'(alan.{1,7})?dershowi(lz?|t?z)|AlanDersh',
    ALIREZA_ITTIHADIEH: r'Alireza.[Il]ttihadieh',
    ALISON_J_NATHAN: r"Alison(\s*J\.?)?\s*Nathan|Nathan NYSD Chambers?",
    AMANDA_ENS: r'ens, amanda?|Amanda.Ens',
    AMIR_TAAKI: r'Amir\s*Taaki|genjix',
    ANAS_ALRASHEED: r'anas\s*al\s*rashee[cd]',
    ANIL_AMBANI: r'Anil.Ambani',
    ANN_MARIE_VILLAFANA: r'Villafana, Ann Marie|(A(\.|nn) Marie )?Villafa(c|n|ri)a',
    ANTHONY_SCARAMUCCI: r"mooch|(Anthony ('The Mooch' )?)?Scaramucci",
    ARIANE_DE_ROTHSCHILD: r'AdeR|((Ariane|Edmond) (de )?)?Rothsh?ch?ild|Ariane(?!\s+Dwyer)',
    BARBRO_C_EHNBOM: r'behnbom@aol.com|(Barbro\s.*)?Ehnbom',
    BARRY_J_COHEN: r'barry\s*((j.?|james)\s*)?cohen?',
    BENNET_MOSKOWITZ: r'Moskowitz.*Bennet|Bennet.*Moskowitz',
    BOB_CROWE: r"[BR]ob Crowe",
    BORIS_NIKOLIC: r'(boris )?nikolic?',
    BRAD_EDWARDS: r'Brad(ley)?(\s*J(.?|ames))?\s*Edwards',
    BRAD_KARP: r'Brad (S.? )?Karp|Karp, Brad',
    CHRISTIAN_EVERDELL: r"C(hristian\s*)?Everdell?",
    CHRISTOPHER_DILORIO: r"Chris\s*Di[lI]o[nr](io)?",
    DANGENE_AND_JENNIE_ENTERPRISE: r'Dangene and Jennie Enterprise?',
    DANNY_FROST: r'Frost, Danny|frostd@dany.nyc.gov|Danny\s*Frost',
    DARREN_INDYKE: r'darren$|Darren\s*(K\.?\s*)?[il]n[dq]_?yke?|dkiesq',
    DAVID_FISZEL: r'David\s*Fis?zel',
    DAVID_HAIG: fr'{DAVID_HAIG}|Haig, David',
    DAVID_STERN: r'David Stern?',
    DOUGLAS_WIGDOR: r'Doug(las)?\s*(H\.?)?\s*Wigdor',
    EDUARDO_ROBLES: r'Ed(uardo)?\s*Robles',
    EDWARD_JAY_EPSTEIN: r'(?<!Jeffrey )Edward (Jay )?Epstein',
    EHUD_BARAK: r'(ehud|e?h)\s*barak|\behud',
    FAITH_KATES: r'faith kates?',
    GERALD_BARTON: r'Gerald.*Barton',
    GERALD_LEFCOURT: r'Gerald\s*(B\.?\s*)?Lefcourt',
    GHISLAINE_MAXWELL: r'g ?max(well)?|Ghislaine|Maxwell',
    HEATHER_MANN: r'Heather Mann?',
    INTELLIGENCE_SQUARED: r'intelligence\s*squared',
    JACKIE_PERCZEK: r'jackie percze[kl]?',
    JABOR_Y: r'[ji]abor\s*y?',
    JAMES_HILL: r"hill, james e.|james.e.hill@abc.com",
    JANUSZ_BANASIAK: r"Janu[is]z Banasiak",
    JEAN_HUGUEN: r"Jean[\s.]Huguen",
    JEAN_LUC_BRUNEL: r'Jean[- ]Luc Brunel?|JeanLuc',
    JEFF_FULLER: r"jeff@mc2mm.com|Jeff Fuller",
    JEFFREY_EPSTEIN: r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeff(rey)? (Edward )?E((sp|ps)tein?)?( VI Foundation)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!(Mark L.|ard Jay) )Epstein',
    JESSICA_CADWELL: r'Jessica Cadwell?',
    JOHNNY_EL_HACHEM: r'el hachem johnny|johnny el hachem',
    JOI_ITO: r'ji@media.mit.?edu|(joichi|joi)( Ito)?',
    JONATHAN_FARKAS: r'Jonathan Fark(a|u)(s|il)',
    KARYNA_SHULIAK: r"Karyna\s*Shuliak?",
    KATHRYN_RUEMMLER: r'Kathr?yn? Ruemmler?',
    KEN_STARR: r'starr, ken|Ken(neth\s*(W.\s*)?)?\s+starr?|starr',
    LANDON_THOMAS: r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]',
    LARRY_SUMMERS: r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|[Il]hsofficel?',
    LAWRANCE_VISOSKI: r'La(rry|wrance) Visoski?|Lvjet',
    LAWRENCE_KRAUSS: r'Lawrence Kraus[es]?|[jl]awkrauss|kruase',
    LEON_BLACK: r'Leon\s*Black?|(?<!Marc )Leon(?! (Botstein|Jaworski|Wieseltier))',
    LILLY_SANCHEZ: r'Lilly.*Sanchez',
    LISA_NEW: r'E?Lisa New?\b',
    MANUELA_MARTINEZ: fr'Manuela (- Mega Partners|Martinez)',
    MARIANA_IDZKOWSKA: r'Mariana [Il]d[źi]kowska?',
    MARK_EPSTEIN: r'Mark (L\. )?(Epstein|Lloyd)',
    MARC_LEON: r'Marc[.\s]+(Kensington|Leon)|Kensington2',
    MARTIN_NOWAK: r'(Martin.*?)?No[vw]ak|Nowak, Martin',
    MARTIN_WEINBERG: r'martin.*?weinberg',
    "Matthew Schafer": r"matthew\.?schafer?",
    MELANIE_SPINELLA: r'M?elanie Spine[Il]{2}a',
    MICHAEL_BUCHHOLTZ: r'Michael.*Buchholtz',
    MICHAEL_MILLER: r'Micha(el)? Miller|Miller, Micha(el)?',
    MICHAEL_SITRICK: r'(Mi(chael|ke).{0,5})?[CS]itrick',
    MICHAEL_WOLFF: r'Michael\s*Wol(f[ef]e?|i)|Wolff',
    MIROSLAV_LAJCAK: r"Miro(slav)?(\s+Laj[cč][aá]k)?",
    MOHAMED_WAHEED_HASSAN: r'Mohamed Waheed(\s+Hassan)?',
    NADIA_MARCINKO: r"Na[dď]i?a\s+Marcinko(v[aá])?",
    NEAL_KASSELL: r'Neal\s*Kassell?',
    NICHOLAS_RIBIS: r'Nic(holas|k)[\s._]Ribi?s?|Ribbis',
    OLIVIER_COLOM: fr'Colom, Olivier|{OLIVIER_COLOM}',
    PAUL_BARRETT: r'Paul Barre(d|tt)',
    PAUL_KRASSNER: r'Pa\s?ul Krassner',
    PAUL_MORRIS: r'morris, paul|Paul Morris',
    PAULA: r'^Paula( Heil Fisher)?$',
    PEGGY_SIEGAL: r'Peggy Siegal?',
    PETER_ATTIA: r'Peter Attia?',
    PETER_MANDELSON: r"((Lord|Peter) )?Mandelson",
    'pink@mc2mm.com': r"^Pink$|pink@mc2mm\.com",
    PRINCE_ANDREW: r'Prince Andrew|The Duke',
    REID_WEINGARTEN: r'Weingarten, Rei[cdi]|Rei[cdi] Weingarten',
    RICHARD_KAHN: r'rich(ard)? kahn?',
    ROBERT_D_CRITTON_JR: r'Robert D.? Critton,? Jr.?',
    ROBERT_LAWRENCE_KUHN: r'Robert\s*(Lawrence)?\s*Kuhn',
    ROBERT_TRIVERS: r'tri[vy]ersr@gmail|Robert\s*Trivers?',
    ROSS_GOW: fr"Ross(acuity)? Gow|(ross@)?acuity\s*reputation(\.com)?",
    SAMUEL_LEFF: r"Sam(uel)?(/Walli)? Leff",
    SCOTT_J_LINK: r'scott j. link?',
    SEAN_BANNON: r'sean bannon?',
    SHAHER_ABDULHAK_BESHER: r'\bShaher( Abdulhak Besher)?\b',
    SOON_YI_PREVIN: r'Soon[- ]Yi Previn?',
    STACEY_RICHMAN: r"srichmanlaw|Stacey\s*Richman",
    STEPHEN_HANSON: r'ste(phen|ve) hanson?|Shanson900',
    STEVE_BANNON: r'steve banno[nr]?',
    STEVEN_SINOFSKY: r'Steven Sinofsky?',
    SULTAN_BIN_SULAYEM: r'Sultan (Ahmed )?bin Sulaye?m?',
    TERJE_ROD_LARSEN: r"Terje(( (R[øo]e?d[- ])?)?Lars[eo]n)?",
    TERRY_KAFKA: r'Terry Kafka?',
    THANU_BOONYAWATANA: r"Thanu (BOONYAWATANA|Cnx)",
    THORBJORN_JAGLAND: r'(Thor.{3,8})?Jag[il]and?',
    TONJA_HADDAD_COLEMAN: r"To(nj|rl)a Haddad Coleman|haddadfm@aol.com",
    VINCENZO_IOZZO: r"Vincenzo [IL]ozzo",
}
|
|
124
|
+
|
|
125
|
+
# If found as substring consider them the author.
# Names/addresses that need no hand-written pattern: each entry is compiled below into
# a simple case-insensitive substring match (with an OCR-tolerant optional last char).
EMAILERS = [
    'Anne Boyles',
    AL_SECKEL,
    'Ariane Dwyer',
    AZIZA_ALAHMADI,
    BILL_GATES,
    BILL_SIEGEL,
    'Bobbi C Sternheim',
    BRAD_WECHSLER,
    BROCK_PIERCE,
    CHRISTINA_GALBRAITH,
    DANIEL_SABBA,
    'Danny Goldberg',
    DAVID_SCHOEN,
    DEBBIE_FEIN,
    DEEPAK_CHOPRA,
    GLENN_DUBIN,
    GORDON_GETTY,
    'Jeff Pagliuca',
    'Kevin Bright',
    'Jack Lang',
    JACK_SCAROLA,
    JAY_LEFKOWITZ,
    JES_STALEY,
    JOHN_PAGE,
    'Jokeland',
    JOSCHA_BACH,
    'Kathleen Ruderman',
    KENNETH_E_MAPP,
    'Larry Cohen',
    LESLEY_GROFF,
    'lorraine@mc2mm.com',
    LINDA_STONE,
    'Lyn Fontanilla',
    MARK_TRAMO,
    MELANIE_WALKER,
    MERWIN_DELA_CRUZ,
    'Michael Simmons', # Not the only "To:"
    'middle.east.update@hotmail.com',
    'Nancy Cain',
    'Nancy Dahl',
    'Nancy Portland',
    'Nathan NYSD Chambers',
    'Oliver Goodenough',
    'Paula Speer',
    'Peter Aldhous',
    'Peter Green',
    ROGER_SCHANK,
    'Roy Black',
    STEVEN_PFEIFFER,
    'Steven Victor MD',
    'Susan Edelman',
    TOM_BARRACK,
    'USANYS',
    'Vahe Stepanian',
    'Vladimir Yudashkin',
]
|
|
183
|
+
|
|
184
|
+
# Compiled, case-insensitive versions of the hand-written identification patterns.
EMAILER_ID_REGEXES = {name: re.compile(pattern, re.IGNORECASE) for name, pattern in EMAILER_ID_PATTERNS.items()}
EMAILER_REGEXES = deepcopy(EMAILER_ID_REGEXES)  # Keep a copy without the simple EMAILERS regexes

# Add simple matching regexes for EMAILERS entries to EMAILER_REGEXES
for emailer in EMAILERS:
    if emailer in EMAILER_REGEXES:
        raise RuntimeError(f"Can't overwrite emailer regex for '{emailer}'")

    # Escape the name so regex metacharacters match literally ('.' in entries like
    # 'middle.east.update@hotmail.com' previously acted as a wildcard, and chars like
    # '(' would raise re.error), then make the last char optional bc OCR sucks.
    EMAILER_REGEXES[emailer] = re.compile(
        f"{re.escape(emailer[:-1])}(?:{re.escape(emailer[-1])})?",
        re.IGNORECASE,
    )
|
|
193
|
+
|
|
194
|
+
# Author strings known to be unidentifiable (mailing-list / multi-sender placeholders);
# extract_emailer_names() skips the "No emailer found" warning for these exact values.
SUPPRESS_LOGS_FOR_AUTHORS = [
    'Multiple Senders Multiple Senders',
    'Undisclosed recipients:',
    'undisclosed-recipients:',
]
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def cleanup_str(_str: str) -> str:
    """Remove redaction markers, disallowed name characters, and stray whitespace/underscores."""
    without_redactions = _str.replace(REDACTED, '')
    without_bad_chars = BAD_NAME_CHARS_REGEX.sub('', without_redactions)
    # Trim whitespace, then underscores, then any whitespace the underscores were hiding.
    return without_bad_chars.strip().strip('_').strip()
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def extract_emailer_names(emailer_str: str) -> list[str]:
    """Return a list of people's names found in `emailer_str` (email author or recipients field)."""
    emailer_str = cleanup_str(emailer_str)

    if not emailer_str:
        return []

    matches = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]

    # Strings that are too short, look like garbage, or look like a timestamp are
    # "semi-invalid": log what happened and return whatever matched without reformatting.
    looks_invalid = (
        len(emailer_str) <= 2
        or BAD_EMAILER_REGEX.match(emailer_str) is not None
        or TIME_REGEX.match(emailer_str) is not None
    )

    if looks_invalid:
        if matches or emailer_str in SUPPRESS_LOGS_FOR_AUTHORS:
            logger.info(f"Extracted {len(matches)} names from semi-invalid '{emailer_str}': {matches}...")
        else:
            logger.warning(f"No emailer found in '{escape_single_quotes(emailer_str)}'")

        return matches

    # Fall back to the raw string when no known emailer matched, and normalize
    # "Last, First" style orderings.
    return [reverse_first_and_last_names(name) for name in (matches or [emailer_str])]
|
|
@@ -6,13 +6,12 @@ from rich.text import Text
|
|
|
6
6
|
|
|
7
7
|
from epstein_files.util.constant.names import ANTHONY_SCARAMUCCI, JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
|
|
8
8
|
from epstein_files.util.constant.strings import TIMESTAMP_DIM
|
|
9
|
-
from epstein_files.util.data import iso_timestamp
|
|
9
|
+
from epstein_files.util.data import AMERICAN_DATE_FORMAT, iso_timestamp
|
|
10
10
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
11
11
|
from epstein_files.util.logging import logger
|
|
12
12
|
from epstein_files.util.rich import TEXT_LINK, highlighter
|
|
13
13
|
|
|
14
14
|
EPSTEIN_TEXTERS = ['e:', 'e:jeeitunes@gmail.com']
|
|
15
|
-
MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
|
|
16
15
|
PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
|
|
17
16
|
UNCERTAIN_SUFFIX = ' (?)'
|
|
18
17
|
|
|
@@ -54,7 +53,7 @@ class TextMessage:
|
|
|
54
53
|
return self.text.startswith('http')
|
|
55
54
|
|
|
56
55
|
def parse_timestamp(self) -> datetime:
|
|
57
|
-
return datetime.strptime(self.timestamp_str,
|
|
56
|
+
return datetime.strptime(self.timestamp_str, AMERICAN_DATE_FORMAT)
|
|
58
57
|
|
|
59
58
|
def timestamp_txt(self) -> Text:
|
|
60
59
|
try:
|
|
@@ -28,6 +28,24 @@ class JsonFile(OtherFile):
|
|
|
28
28
|
include_description_in_summary_panel: ClassVar[bool] = False
|
|
29
29
|
strip_whitespace: ClassVar[bool] = False
|
|
30
30
|
|
|
31
|
+
@property
|
|
32
|
+
def category(self) -> str:
|
|
33
|
+
return JSON
|
|
34
|
+
|
|
35
|
+
@property
|
|
36
|
+
def is_interesting(self):
|
|
37
|
+
return False
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def info_txt(self) -> Text | None:
|
|
41
|
+
return Text(DESCRIPTION, style=INFO_STYLE)
|
|
42
|
+
|
|
43
|
+
@property
|
|
44
|
+
def metadata(self) -> Metadata:
|
|
45
|
+
metadata = super().metadata
|
|
46
|
+
metadata['description'] = DESCRIPTION
|
|
47
|
+
return metadata
|
|
48
|
+
|
|
31
49
|
def __post_init__(self):
|
|
32
50
|
super().__post_init__()
|
|
33
51
|
|
|
@@ -36,23 +54,9 @@ class JsonFile(OtherFile):
|
|
|
36
54
|
|
|
37
55
|
self._set_computed_fields(text=self.json_str())
|
|
38
56
|
|
|
39
|
-
def category(self) -> str:
|
|
40
|
-
return JSON
|
|
41
|
-
|
|
42
|
-
def info_txt(self) -> Text | None:
|
|
43
|
-
return Text(DESCRIPTION, style=INFO_STYLE)
|
|
44
|
-
|
|
45
|
-
def is_interesting(self):
|
|
46
|
-
return False
|
|
47
|
-
|
|
48
57
|
def json_data(self) -> object:
|
|
49
58
|
with open(self.file_path, encoding='utf-8-sig') as f:
|
|
50
59
|
return json.load(f)
|
|
51
60
|
|
|
52
|
-
def metadata(self) -> Metadata:
|
|
53
|
-
metadata = super().metadata()
|
|
54
|
-
metadata['description'] = DESCRIPTION
|
|
55
|
-
return metadata
|
|
56
|
-
|
|
57
61
|
def json_str(self) -> str:
|
|
58
62
|
return json.dumps(self.json_data(), indent=4)
|
|
@@ -31,38 +31,30 @@ class MessengerLog(Communication):
|
|
|
31
31
|
messages: list[TextMessage] = field(default_factory=list)
|
|
32
32
|
phone_number: str | None = None
|
|
33
33
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def first_message_at(self, name: Name) -> datetime:
|
|
39
|
-
return self.messages_by(name)[0].parse_timestamp()
|
|
34
|
+
@property
|
|
35
|
+
def border_style(self) -> str:
|
|
36
|
+
return self.author_style
|
|
40
37
|
|
|
38
|
+
@property
|
|
41
39
|
def info_txt(self) -> Text | None:
|
|
42
40
|
num_days_str = days_between_str(self.timestamp, self.messages[-1].parse_timestamp())
|
|
43
41
|
txt = Text(f"(Covers {num_days_str} starting ", style='dim')
|
|
44
|
-
txt.append(self.date_str
|
|
42
|
+
txt.append(self.date_str, style=TIMESTAMP_STYLE).append(' ')
|
|
45
43
|
|
|
46
44
|
if not self.author:
|
|
47
45
|
txt.append('with unknown counterparty')
|
|
48
46
|
else:
|
|
49
|
-
txt.append(GUESSED_MSG if self.is_attribution_uncertain
|
|
50
|
-
txt.append(Text(self.author, style=self.author_style
|
|
47
|
+
txt.append(GUESSED_MSG if self.is_attribution_uncertain else CONFIRMED_MSG).append(' ')
|
|
48
|
+
txt.append(Text(self.author, style=self.author_style + ' bold'))
|
|
51
49
|
|
|
52
50
|
if self.phone_number:
|
|
53
51
|
txt.append(highlighter(f" using the phone number {self.phone_number}"))
|
|
54
52
|
|
|
55
53
|
return txt.append(')')
|
|
56
54
|
|
|
57
|
-
|
|
58
|
-
return self.messages_by(name)[-1].parse_timestamp()
|
|
59
|
-
|
|
60
|
-
def messages_by(self, name: Name) -> list[TextMessage]:
|
|
61
|
-
"""Return all messages by 'name'."""
|
|
62
|
-
return [m for m in self.messages if m.author == name]
|
|
63
|
-
|
|
55
|
+
@property
|
|
64
56
|
def metadata(self) -> Metadata:
|
|
65
|
-
metadata = super().metadata
|
|
57
|
+
metadata = super().metadata
|
|
66
58
|
metadata.update({'num_messages': len(self.messages)})
|
|
67
59
|
|
|
68
60
|
if self.phone_number:
|
|
@@ -70,8 +62,19 @@ class MessengerLog(Communication):
|
|
|
70
62
|
|
|
71
63
|
return metadata
|
|
72
64
|
|
|
73
|
-
def
|
|
74
|
-
|
|
65
|
+
def __post_init__(self):
|
|
66
|
+
super().__post_init__()
|
|
67
|
+
self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
|
|
68
|
+
|
|
69
|
+
def first_message_at(self, name: Name) -> datetime:
|
|
70
|
+
return self.messages_by(name)[0].parse_timestamp()
|
|
71
|
+
|
|
72
|
+
def last_message_at(self, name: Name) -> datetime:
|
|
73
|
+
return self.messages_by(name)[-1].parse_timestamp()
|
|
74
|
+
|
|
75
|
+
def messages_by(self, name: Name) -> list[TextMessage]:
|
|
76
|
+
"""Return all messages by 'name'."""
|
|
77
|
+
return [m for m in self.messages if m.author == name]
|
|
75
78
|
|
|
76
79
|
def _build_message(self, match: re.Match) -> TextMessage:
|
|
77
80
|
"""Turn a regex match into a TextMessage."""
|
|
@@ -86,7 +89,7 @@ class MessengerLog(Communication):
|
|
|
86
89
|
return TextMessage(
|
|
87
90
|
author=self.author if (is_phone_number or not author_str) else author_str,
|
|
88
91
|
author_str=author_str if is_phone_number else '', # Preserve phone numbers
|
|
89
|
-
is_id_confirmed=not self.is_attribution_uncertain
|
|
92
|
+
is_id_confirmed=not self.is_attribution_uncertain,
|
|
90
93
|
text=match.group(4).strip(),
|
|
91
94
|
timestamp_str=match.group(2).strip(),
|
|
92
95
|
)
|
|
@@ -102,25 +105,6 @@ class MessengerLog(Communication):
|
|
|
102
105
|
|
|
103
106
|
raise RuntimeError(f"{self}: No timestamp found!")
|
|
104
107
|
|
|
105
|
-
def _set_message_timestamps(self) -> None:
|
|
106
|
-
raise NotImplementedError(f"TextMessage.timestamp no longer exists")
|
|
107
|
-
last_message: TextMessage | None = None
|
|
108
|
-
|
|
109
|
-
for i, message in enumerate(self.messages):
|
|
110
|
-
try:
|
|
111
|
-
message.timestamp = message.parse_timestamp()
|
|
112
|
-
except Exception as e:
|
|
113
|
-
msg = f"Failed to parse timestamp for TextMessage {i + 1}, {message}: {e}"
|
|
114
|
-
|
|
115
|
-
if i == 0:
|
|
116
|
-
message.timestamp = self.timestamp
|
|
117
|
-
self.warn(f"{msg}\nit's the first message so using the MessengerLog timestamp property {self.timestamp}")
|
|
118
|
-
else:
|
|
119
|
-
message.timestamp = last_message.timestamp + timedelta(milliseconds=1)
|
|
120
|
-
self.warn(f"{msg}\nadding 1 millisecond to last timestamp {last_message.timestamp}")
|
|
121
|
-
|
|
122
|
-
last_message = message
|
|
123
|
-
|
|
124
108
|
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
125
109
|
yield self.file_info_panel()
|
|
126
110
|
yield Text('')
|
|
@@ -30,7 +30,6 @@ MAX_DAYS_SPANNED_TO_BE_VALID = 10
|
|
|
30
30
|
MAX_EXTRACTED_TIMESTAMPS = 100
|
|
31
31
|
MIN_TIMESTAMP = datetime(2000, 1, 1)
|
|
32
32
|
MID_TIMESTAMP = datetime(2007, 1, 1)
|
|
33
|
-
MAX_TIMESTAMP = datetime(2022, 12, 31)
|
|
34
33
|
PREVIEW_CHARS = int(580 * (1 if args.all_other_files else 1.5))
|
|
35
34
|
LOG_INDENT = '\n '
|
|
36
35
|
TIMESTAMP_LOG_INDENT = f'{LOG_INDENT} '
|
|
@@ -93,40 +92,28 @@ class OtherFile(Document):
|
|
|
93
92
|
"""
|
|
94
93
|
was_timestamp_extracted: bool = False
|
|
95
94
|
include_description_in_summary_panel: ClassVar[bool] = True # Class var for logging output
|
|
95
|
+
max_timestamp: ClassVar[datetime] = datetime(2022, 12, 31) # Overloaded in DojFile
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
if self.config
|
|
101
|
-
self.
|
|
102
|
-
self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
|
|
97
|
+
@property
|
|
98
|
+
def config_description(self) -> str | None:
|
|
99
|
+
"""Overloads superclass property."""
|
|
100
|
+
if self.config and self.config.description:
|
|
101
|
+
return self.config.complete_description
|
|
103
102
|
|
|
103
|
+
@property
|
|
104
104
|
def category(self) -> str | None:
|
|
105
105
|
return self.config and self.config.category
|
|
106
106
|
|
|
107
|
+
@property
|
|
107
108
|
def category_txt(self) -> Text | None:
|
|
108
|
-
return styled_category(self.category
|
|
109
|
-
|
|
110
|
-
def config_description(self) -> str | None:
|
|
111
|
-
"""Overloads superclass method."""
|
|
112
|
-
if self.config is not None:
|
|
113
|
-
return self.config.complete_description()
|
|
114
|
-
|
|
115
|
-
def highlighted_preview_text(self) -> Text:
|
|
116
|
-
try:
|
|
117
|
-
return highlighter(escape(self.preview_text()))
|
|
118
|
-
except Exception as e:
|
|
119
|
-
logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
|
|
120
|
-
f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
|
|
121
|
-
f"File: '{self.filename}'\n")
|
|
122
|
-
|
|
123
|
-
return Text(escape(self.preview_text()))
|
|
109
|
+
return styled_category(self.category)
|
|
124
110
|
|
|
111
|
+
@property
|
|
125
112
|
def is_interesting(self) -> bool:
|
|
126
113
|
"""Overloaded. False for lame prefixes, duplicates, and other boring files."""
|
|
127
|
-
info_sentences = self.info
|
|
114
|
+
info_sentences = self.info
|
|
128
115
|
|
|
129
|
-
if self.is_duplicate
|
|
116
|
+
if self.is_duplicate:
|
|
130
117
|
return False
|
|
131
118
|
elif len(info_sentences) == 0:
|
|
132
119
|
return True
|
|
@@ -135,9 +122,9 @@ class OtherFile(Document):
|
|
|
135
122
|
return self.config.is_interesting
|
|
136
123
|
elif self.config.author in INTERESTING_AUTHORS:
|
|
137
124
|
return True
|
|
138
|
-
elif self.category
|
|
125
|
+
elif self.category == FINANCE and self.author is not None:
|
|
139
126
|
return False
|
|
140
|
-
elif self.category
|
|
127
|
+
elif self.category in UNINTERESTING_CATEGORIES:
|
|
141
128
|
return False
|
|
142
129
|
|
|
143
130
|
for prefix in UNINTERESTING_PREFIXES:
|
|
@@ -146,15 +133,33 @@ class OtherFile(Document):
|
|
|
146
133
|
|
|
147
134
|
return True
|
|
148
135
|
|
|
136
|
+
@property
|
|
149
137
|
def metadata(self) -> Metadata:
|
|
150
|
-
metadata = super().metadata
|
|
151
|
-
metadata['is_interesting'] = self.is_interesting
|
|
138
|
+
metadata = super().metadata
|
|
139
|
+
metadata['is_interesting'] = self.is_interesting
|
|
152
140
|
|
|
153
141
|
if self.was_timestamp_extracted:
|
|
154
142
|
metadata['was_timestamp_extracted'] = self.was_timestamp_extracted
|
|
155
143
|
|
|
156
144
|
return metadata
|
|
157
145
|
|
|
146
|
+
def __post_init__(self):
|
|
147
|
+
super().__post_init__()
|
|
148
|
+
|
|
149
|
+
if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
|
|
150
|
+
self.log(f"Creating synthetic config for VI Daily News article...")
|
|
151
|
+
self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
|
|
152
|
+
|
|
153
|
+
def highlighted_preview_text(self) -> Text:
|
|
154
|
+
try:
|
|
155
|
+
return highlighter(escape(self.preview_text()))
|
|
156
|
+
except Exception as e:
|
|
157
|
+
logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
|
|
158
|
+
f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
|
|
159
|
+
f"File: '{self.filename}'\n")
|
|
160
|
+
|
|
161
|
+
return Text(escape(self.preview_text()))
|
|
162
|
+
|
|
158
163
|
def preview_text(self) -> str:
|
|
159
164
|
return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
|
|
160
165
|
|
|
@@ -164,9 +169,7 @@ class OtherFile(Document):
|
|
|
164
169
|
|
|
165
170
|
def _extract_timestamp(self) -> datetime | None:
|
|
166
171
|
"""Return configured timestamp or value extracted by scanning text with datefinder."""
|
|
167
|
-
if self.config and self.
|
|
168
|
-
return self.config.timestamp()
|
|
169
|
-
elif self.config and any([s in (self.config_description() or '') for s in SKIP_TIMESTAMP_EXTRACT]):
|
|
172
|
+
if self.config and any([s in (self.config_description or '') for s in SKIP_TIMESTAMP_EXTRACT]):
|
|
170
173
|
return None
|
|
171
174
|
|
|
172
175
|
timestamps: list[datetime] = []
|
|
@@ -175,10 +178,11 @@ class OtherFile(Document):
|
|
|
175
178
|
warnings.filterwarnings("ignore", module="dateutil")
|
|
176
179
|
|
|
177
180
|
try:
|
|
178
|
-
|
|
181
|
+
# TODO: datefinder.find_dates() cannot find 08/29/2019 style e.g. in EFTA00005783 :(
|
|
182
|
+
for timestamp in datefinder.find_dates(self.text, strict=False):
|
|
179
183
|
timestamp = remove_timezone(timestamp)
|
|
180
184
|
|
|
181
|
-
if MIN_TIMESTAMP < timestamp <
|
|
185
|
+
if MIN_TIMESTAMP < timestamp < self.max_timestamp:
|
|
182
186
|
timestamps.append(timestamp)
|
|
183
187
|
|
|
184
188
|
if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
|
|
@@ -187,7 +191,7 @@ class OtherFile(Document):
|
|
|
187
191
|
self.warn(f"Error while iterating through datefinder.find_dates(): {e}")
|
|
188
192
|
|
|
189
193
|
if len(timestamps) == 0:
|
|
190
|
-
if not (self.is_duplicate
|
|
194
|
+
if not (self.is_duplicate or VAST_HOUSE in self.text):
|
|
191
195
|
self.log_top_lines(15, msg=f"No timestamps found")
|
|
192
196
|
|
|
193
197
|
return None
|
|
@@ -222,21 +226,21 @@ class OtherFile(Document):
|
|
|
222
226
|
|
|
223
227
|
for file in files:
|
|
224
228
|
link_and_info = [file.external_links_txt()]
|
|
225
|
-
date_str = file.date_str
|
|
229
|
+
date_str = file.date_str
|
|
226
230
|
|
|
227
|
-
if file.is_duplicate
|
|
228
|
-
preview_text = file.duplicate_file_txt
|
|
231
|
+
if file.is_duplicate:
|
|
232
|
+
preview_text = file.duplicate_file_txt
|
|
229
233
|
row_style = ' dim'
|
|
230
234
|
else:
|
|
231
|
-
link_and_info += file.info
|
|
235
|
+
link_and_info += file.info
|
|
232
236
|
preview_text = file.highlighted_preview_text()
|
|
233
237
|
row_style = ''
|
|
234
238
|
|
|
235
239
|
table.add_row(
|
|
236
240
|
Group(*link_and_info),
|
|
237
241
|
Text(date_str, style=TIMESTAMP_STYLE) if date_str else QUESTION_MARKS_TXT,
|
|
238
|
-
file.file_size_str
|
|
239
|
-
file.category_txt
|
|
242
|
+
file.file_size_str,
|
|
243
|
+
file.category_txt,
|
|
240
244
|
preview_text,
|
|
241
245
|
style=row_style
|
|
242
246
|
)
|
|
@@ -246,12 +250,12 @@ class OtherFile(Document):
|
|
|
246
250
|
@classmethod
|
|
247
251
|
def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
248
252
|
"""Table showing file count by category."""
|
|
249
|
-
categories = uniquify([f.category
|
|
250
|
-
categories = sorted(categories, key=lambda c: -len([f for f in files if f.category
|
|
253
|
+
categories = uniquify([f.category for f in files])
|
|
254
|
+
categories = sorted(categories, key=lambda c: -len([f for f in files if f.category == c]))
|
|
251
255
|
table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
|
|
252
256
|
|
|
253
257
|
for category in categories:
|
|
254
|
-
category_files = [f for f in files if f.category
|
|
258
|
+
category_files = [f for f in files if f.category == category]
|
|
255
259
|
table.add_row(styled_category(category), *cls.files_info_row(category_files))
|
|
256
260
|
|
|
257
261
|
table.columns = table.columns[:-2] + [table.columns[-1]] # Removee unknown author col
|