epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. epstein_files/__init__.py +31 -18
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +225 -136
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +138 -163
  7. epstein_files/documents/emails/email_header.py +21 -11
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +48 -44
  13. epstein_files/epstein_files.py +54 -33
  14. epstein_files/person.py +142 -110
  15. epstein_files/util/constant/names.py +29 -6
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +12 -6
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +101 -174
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +20 -15
  22. epstein_files/util/env.py +24 -16
  23. epstein_files/util/file_helper.py +28 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +57 -16
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +33 -10
  30. epstein_files/util/rich.py +28 -2
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/METADATA +14 -1
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. epstein_files-1.4.1.dist-info/RECORD +0 -34
  35. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  36. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
  37. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +0 -0
@@ -177,6 +177,16 @@ ZUBAIR_KHAN = 'Zubair Khan'
177
177
 
178
178
  UNKNOWN = '(unknown)'
179
179
 
180
+ # DOJ files emails
181
+ ALISON_J_NATHAN = 'Alison J. Nathan'
182
+ AMIR_TAAKI = 'Amir Taaki'
183
+ BROCK_PIERCE = 'Brock Pierce'
184
+ CHRISTIAN_EVERDELL = 'Christian Everdell'
185
+ CHRISTOPHER_DILORIO = 'Christopher Dilorio'
186
+ DOUGLAS_WIGDOR = 'Douglas Wigdor'
187
+ KARYNA_SHULIAK = 'Karyna Shuliak'
188
+ STACEY_RICHMAN = 'Stacey Richman'
189
+
180
190
  # No communications but name is in the files
181
191
  BILL_GATES = 'Bill Gates'
182
192
  DONALD_TRUMP = 'Donald Trump'
@@ -216,10 +226,10 @@ UBS = 'UBS'
216
226
 
217
227
  # First and last names that should NOT be made part of a highlighting regex for emailers
218
228
  NAMES_TO_NOT_HIGHLIGHT = """
219
- al alain alan alfredo allen alex alexander amanda andres andrew anthony
229
+ al alain alan alison alfredo allen alex alexander amanda andres andrew anthony
220
230
  bard barrett barry bennet bernard bill black bob boris brad brenner bruce
221
- cameron caroline carolyn chris christina cohen
222
- dan daniel danny darren dave david debbie donald
231
+ cameron caroline carolyn chris christian christina cohen
232
+ dan daniel danny darren dave david debbie donald douglas
223
233
  ed edward edwards enforcement enterprise enterprises entourage epstein eric erika etienne
224
234
  faith fisher forget fred friendly frost fuller
225
235
  gates gerald george gold gordon
@@ -229,11 +239,11 @@ NAMES_TO_NOT_HIGHLIGHT = """
229
239
  kafka kahn karl kate katherine kelly ken kevin krassner
230
240
  larry larsen laurie lawrence leon lesley linda link lisa
231
241
  mann marc marie mark martin matthew melanie michael mike miller mitchell miles morris moskowitz
232
- nancy neal new nicole norman
242
+ nancy nathan neal new nicole norman
233
243
  owen
234
- paul paula pen peter philip prince
244
+ paul paula pen peter philip pierce prince
235
245
  randall rangel reid richard robert rodriguez roger rosenberg ross roth roy rubenstein rubin
236
- scott sean skip smith stanley stern stephen steve steven stone susan
246
+ scott sean skip smith stacey stanley stern stephen steve steven stone susan
237
247
  terry the thomas tim tom tony tyler
238
248
  victor
239
249
  wade waters
@@ -304,7 +314,20 @@ def extract_last_name(name: str) -> str:
304
314
  return first_last_names[-1]
305
315
 
306
316
 
317
+ def reverse_first_and_last_names(name: str) -> str:
318
+ """If there's a comma in the name in the style 'Lastname, Firstname', reverse it and remove comma."""
319
+ if '@' in name:
320
+ return name.lower()
321
+
322
+ if ', ' in name:
323
+ names = name.split(', ')
324
+ return f"{names[1]} {names[0]}"
325
+ else:
326
+ return name
327
+
328
+
307
329
  def reversed_name(name: str) -> str:
330
+ """'Jeffrey Epstein' becomes 'Epstein Jeffrey'."""
308
331
  if ' ' not in name:
309
332
  return name
310
333
 
@@ -13,6 +13,7 @@ JSON_METADATA_PATH = HTML_DIR.joinpath(f'file_metadata_{EPSTEIN_FILES_NOV_2025}.
13
13
  TEXT_MSGS_HTML_PATH = HTML_DIR.joinpath('index.html')
14
14
  WORD_COUNT_HTML_PATH = HTML_DIR.joinpath(f'communication_word_count_{EPSTEIN_FILES_NOV_2025}.html')
15
15
  # EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
16
+ DOJ_2026_HTML_PATH = HTML_DIR.joinpath('doj_2026-01-30_files.html')
16
17
  URLS_ENV = '.urls.env'
17
18
  EMAILERS_TABLE_PNG_PATH = HTML_DIR.joinpath('emailers_info_table.png')
18
19
 
@@ -26,6 +27,7 @@ CHRONOLOGICAL_EMAILS_URL = f"{TEXT_MSGS_URL}/{CHRONOLOGICAL_EMAILS_PATH.name}"
26
27
  JSON_FILES_URL = f"{TEXT_MSGS_URL}/{JSON_FILES_JSON_PATH.name}"
27
28
  JSON_METADATA_URL = f"{TEXT_MSGS_URL}/{JSON_METADATA_PATH.name}"
28
29
  WORD_COUNT_URL = f"{TEXT_MSGS_URL}/{WORD_COUNT_HTML_PATH.name}"
30
+ DOJ_2026_URL = f"{TEXT_MSGS_URL}/{DOJ_2026_HTML_PATH.name}"
29
31
 
30
32
  SITE_URLS: dict[SiteType, str] = {
31
33
  EMAIL: ALL_EMAILS_URL,
@@ -57,25 +57,31 @@ TIMESTAMP_DIM = f"turquoise4 dim"
57
57
  # Misc
58
58
  AUTHOR = 'author'
59
59
  DEFAULT = 'default'
60
+ EFTA_PREFIX = 'EFTA'
60
61
  HOUSE_OVERSIGHT_PREFIX = 'HOUSE_OVERSIGHT_'
61
62
  JSON = 'json'
62
63
  NA = 'n/a'
63
64
  REDACTED = '<REDACTED>'
64
65
  QUESTION_MARKS = '(???)'
65
66
 
66
- # Regexes
67
- ID_REGEX = re.compile(r"\d{6}(_\d{1,2})?")
68
- FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}({ID_REGEX.pattern})")
69
- FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
70
- QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')
71
-
72
67
  # Document subclass names (this sucks)
73
68
  DOCUMENT_CLASS = 'Document'
69
+ DOJ_FILE_CLASS = 'DojFile'
74
70
  EMAIL_CLASS = 'Email'
75
71
  JSON_FILE_CLASS = 'JsonFile'
76
72
  MESSENGER_LOG_CLASS = 'MessengerLog'
77
73
  OTHER_FILE_CLASS = 'OtherFile'
78
74
 
75
+ # Regexes
76
+ DOJ_FILE_STEM_REGEX = re.compile(fr"{EFTA_PREFIX}\d{{8}}")
77
+ DOJ_FILE_NAME_REGEX = re.compile(fr"{DOJ_FILE_STEM_REGEX.pattern}(\.txt)?")
78
+
79
+ HOUSE_OVERSIGHT_NOV_2025_ID_REGEX = re.compile(r"\d{6}(_\d{1,2})?")
80
+ HOUSE_OVERSIGHT_NOV_2025_FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}({HOUSE_OVERSIGHT_NOV_2025_ID_REGEX.pattern})")
81
+ HOUSE_OVERSIGHT_NOV_2025_FILE_NAME_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_NOV_2025_FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
82
+
83
+ QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')
84
+
79
85
 
80
86
  remove_question_marks = lambda name: QUESTION_MARKS_REGEX.sub('', name).strip()
81
87
 
@@ -12,6 +12,7 @@ from epstein_files.util.file_helper import coerce_file_stem
12
12
 
13
13
  # Style stuff
14
14
  ARCHIVE_LINK_COLOR = 'slate_blue3'
15
+ ARCHIVE_ALT_LINK_STYLE = 'medium_purple4 italic'
15
16
  TEXT_LINK = 'text_link'
16
17
 
17
18
  # External site names
@@ -39,6 +40,9 @@ EPSTEIN_DOCS_URL = 'https://epstein-docs.github.io'
39
40
  OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
40
41
  RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL = 'https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_'
41
42
  SUBSTACK_URL = 'https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great'
43
+ # DOJ docs
44
+ DOJ_2026_URL = 'https://www.justice.gov/epstein/doj-disclosures'
45
+ DOJ_SEARCH_URL = 'https://www.justice.gov/epstein/search'
42
46
 
43
47
  # Document source sites
44
48
  EPSTEINIFY_URL = 'https://epsteinify.com'
@@ -53,6 +57,9 @@ DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
53
57
  ROLLCALL: f'https://rollcall.com/factbase/epstein/file?id=',
54
58
  }
55
59
 
60
+ # Example: https://www.justice.gov/epstein/files/DataSet%208/EFTA00009802.pdf
61
+ DOJ_2026_FILE_BASE_URL = "https://www.justice.gov/epstein/files/DataSet%20"
62
+
56
63
 
57
64
  epsteinify_api_url = lambda file_stem: f"{EPSTEINIFY_URL}/api/documents/{file_stem}"
58
65
  epsteinify_doc_link_markup = lambda filename_or_id, style = TEXT_LINK: external_doc_link_markup(EPSTEINIFY, filename_or_id, style)
@@ -90,6 +97,16 @@ def build_doc_url(base_url: str, filename_or_id: int | str, case: Literal['lower
90
97
  return f"{base_url}{file_stem}"
91
98
 
92
99
 
100
+ def doj_2026_file_url(dataset_id: int, file_stem: str) -> str:
101
+ """Link to justice.gov for a DOJ file."""
102
+ return f"{DOJ_2026_FILE_BASE_URL}{dataset_id}/{file_stem}.pdf"
103
+
104
+
105
+ def jmail_doj_2026_file_url(dataset_id: int, file_stem: str) -> str:
106
+ """Link to Jmail backup of DOJ file."""
107
+ return f"{JMAIL_URL}/drive/vol{dataset_id:05}-{file_stem.lower()}-pdf"
108
+
109
+
93
110
  def external_doc_link_markup(site: ExternalSite, filename_or_id: int | str, style: str = TEXT_LINK) -> str:
94
111
  url = build_doc_url(DOC_LINK_BASE_URLS[site], filename_or_id)
95
112
  return link_markup(url, coerce_file_stem(filename_or_id), style)
@@ -4,6 +4,7 @@ from typing import cast
4
4
 
5
5
  from dateutil.parser import parse
6
6
 
7
+ from epstein_files.documents.doj_files.full_text import EFTA00009622_TEXT
7
8
  from epstein_files.util.constant.names import *
8
9
  from epstein_files.util.constant.strings import *
9
10
  from epstein_files.util.doc_cfg import DocCfg, EmailCfg, TextCfg
@@ -56,172 +57,6 @@ HEADER_ABBREVIATIONS = {
56
57
  # Emailers Config Stuff #
57
58
  #########################
58
59
 
59
- # Emailers
60
- EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
61
- ALAN_DERSHOWITZ: re.compile(r'(alan.{1,7})?dershowi(lz?|t?z)|AlanDersh', re.IGNORECASE),
62
- ALIREZA_ITTIHADIEH: re.compile(r'Alireza.[Il]ttihadieh', re.IGNORECASE),
63
- AMANDA_ENS: re.compile(r'ens, amanda?|Amanda.Ens', re.IGNORECASE),
64
- ANAS_ALRASHEED: re.compile(r'anas\s*al\s*rashee[cd]', re.IGNORECASE),
65
- ANIL_AMBANI: re.compile(r'Anil.Ambani', re.IGNORECASE),
66
- ANN_MARIE_VILLAFANA: re.compile(r'Villafana, Ann Marie|(A(\.|nn) Marie )?Villafa(c|n|ri)a', re.IGNORECASE),
67
- ANTHONY_SCARAMUCCI: re.compile(r"mooch|(Anthony ('The Mooch' )?)?Scaramucci", re.IGNORECASE),
68
- ARIANE_DE_ROTHSCHILD: re.compile(r'AdeR|((Ariane|Edmond) (de )?)?Rothsh?ch?ild|Ariane(?!\s+Dwyer)', re.IGNORECASE),
69
- BARBRO_C_EHNBOM: re.compile(r'behnbom@aol.com|(Barbro\s.*)?Ehnbom', re.IGNORECASE),
70
- BARRY_J_COHEN: re.compile(r'barry\s*((j.?|james)\s*)?cohen?', re.IGNORECASE),
71
- BENNET_MOSKOWITZ: re.compile(r'Moskowitz.*Bennet|Bennet.*Moskowitz', re.IGNORECASE),
72
- BOB_CROWE: re.compile(r"[BR]ob Crowe", re.IGNORECASE),
73
- BORIS_NIKOLIC: re.compile(r'(boris )?nikolic?', re.IGNORECASE),
74
- BRAD_EDWARDS: re.compile(r'Brad(ley)?(\s*J(.?|ames))?\s*Edwards', re.IGNORECASE),
75
- BRAD_KARP: re.compile(r'Brad (S.? )?Karp|Karp, Brad', re.IGNORECASE),
76
- DANGENE_AND_JENNIE_ENTERPRISE: re.compile(r'Dangene and Jennie Enterprise?', re.IGNORECASE),
77
- DANNY_FROST: re.compile(r'Frost, Danny|frostd@dany.nyc.gov|Danny\s*Frost', re.IGNORECASE),
78
- DARREN_INDYKE: re.compile(r'darren$|Darren\s*(K\.?\s*)?[il]n[dq]_?yke?|dkiesq', re.IGNORECASE),
79
- DAVID_FISZEL: re.compile(r'David\s*Fis?zel', re.IGNORECASE),
80
- DAVID_HAIG: re.compile(fr'{DAVID_HAIG}|Haig, David', re.IGNORECASE),
81
- DAVID_STERN: re.compile(r'David Stern?', re.IGNORECASE),
82
- EDUARDO_ROBLES: re.compile(r'Ed(uardo)?\s*Robles', re.IGNORECASE),
83
- EDWARD_JAY_EPSTEIN: re.compile(r'(?<!Jeffrey )Edward (Jay )?Epstein', re.IGNORECASE),
84
- EHUD_BARAK: re.compile(r'(ehud|e?h)\s*barak|\behud', re.IGNORECASE),
85
- FAITH_KATES: re.compile(r'faith kates?', re.IGNORECASE),
86
- GERALD_BARTON: re.compile(r'Gerald.*Barton', re.IGNORECASE),
87
- GERALD_LEFCOURT: re.compile(r'Gerald\s*(B\.?\s*)?Lefcourt', re.IGNORECASE),
88
- GHISLAINE_MAXWELL: re.compile(r'g ?max(well)?|Ghislaine|Maxwell', re.IGNORECASE),
89
- HEATHER_MANN: re.compile(r'Heather Mann?', re.IGNORECASE),
90
- INTELLIGENCE_SQUARED: re.compile(r'intelligence\s*squared', re.IGNORECASE),
91
- JACKIE_PERCZEK: re.compile(r'jackie percze[kl]?', re.IGNORECASE),
92
- JABOR_Y: re.compile(r'[ji]abor\s*y?', re.IGNORECASE),
93
- JAMES_HILL: re.compile(r"hill, james e.|james.e.hill@abc.com", re.IGNORECASE),
94
- JANUSZ_BANASIAK: re.compile(r"Janu[is]z Banasiak", re.IGNORECASE),
95
- JEAN_HUGUEN: re.compile(r"Jean[\s.]Huguen", re.IGNORECASE),
96
- JEAN_LUC_BRUNEL: re.compile(r'Jean[- ]Luc Brunel?|JeanLuc', re.IGNORECASE),
97
- JEFF_FULLER: re.compile(r"jeff@mc2mm.com|Jeff Fuller", re.IGNORECASE),
98
- JEFFREY_EPSTEIN: re.compile(r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeff(rey)? (Edward )?E((sp|ps)tein?)?( VI Foundation)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!(Mark L.|ard Jay) )Epstein', re.IGNORECASE),
99
- JESSICA_CADWELL: re.compile(r'Jessica Cadwell?', re.IGNORECASE),
100
- JOHNNY_EL_HACHEM: re.compile(r'el hachem johnny|johnny el hachem', re.IGNORECASE),
101
- JOI_ITO: re.compile(r'ji@media.mit.?edu|(joichi|joi)( Ito)?', re.IGNORECASE),
102
- JONATHAN_FARKAS: re.compile(r'Jonathan Fark(a|u)(s|il)', re.IGNORECASE),
103
- KATHRYN_RUEMMLER: re.compile(r'Kathr?yn? Ruemmler?', re.IGNORECASE),
104
- KEN_STARR: re.compile(r'starr, ken|Ken(neth\s*(W.\s*)?)?\s+starr?|starr', re.IGNORECASE),
105
- LANDON_THOMAS: re.compile(r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]', re.IGNORECASE),
106
- LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|[Il]hsofficel?', re.IGNORECASE),
107
- LAWRANCE_VISOSKI: re.compile(r'La(rry|wrance) Visoski?|Lvjet', re.IGNORECASE),
108
- LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus[es]?|[jl]awkrauss|kruase', re.IGNORECASE),
109
- LEON_BLACK: re.compile(r'Leon\s*Black?|(?<!Marc )Leon(?! (Botstein|Jaworski|Wieseltier))', re.IGNORECASE),
110
- LILLY_SANCHEZ: re.compile(r'Lilly.*Sanchez', re.IGNORECASE),
111
- LISA_NEW: re.compile(r'E?Lisa New?\b', re.IGNORECASE),
112
- MANUELA_MARTINEZ: re.compile(fr'Manuela (- Mega Partners|Martinez)', re.IGNORECASE),
113
- MARIANA_IDZKOWSKA: re.compile(r'Mariana [Il]d[źi]kowska?', re.IGNORECASE),
114
- MARK_EPSTEIN: re.compile(r'Mark (L\. )?(Epstein|Lloyd)', re.IGNORECASE),
115
- MARC_LEON: re.compile(r'Marc[.\s]+(Kensington|Leon)|Kensington2', re.IGNORECASE),
116
- MARTIN_NOWAK: re.compile(r'(Martin.*?)?No[vw]ak|Nowak, Martin', re.IGNORECASE),
117
- MARTIN_WEINBERG: re.compile(r'martin.*?weinberg', re.IGNORECASE),
118
- "Matthew Schafer": re.compile(r"matthew\.?schafer?", re.IGNORECASE),
119
- MELANIE_SPINELLA: re.compile(r'M?elanie Spine[Il]{2}a', re.IGNORECASE),
120
- MICHAEL_BUCHHOLTZ: re.compile(r'Michael.*Buchholtz', re.IGNORECASE),
121
- MICHAEL_MILLER: re.compile(r'Micha(el)? Miller|Miller, Micha(el)?', re.IGNORECASE),
122
- MICHAEL_SITRICK: re.compile(r'(Mi(chael|ke).{0,5})?[CS]itrick', re.IGNORECASE),
123
- MICHAEL_WOLFF: re.compile(r'Michael\s*Wol(f[ef]e?|i)|Wolff', re.IGNORECASE),
124
- MIROSLAV_LAJCAK: re.compile(r"Miro(slav)?(\s+Laj[cč][aá]k)?"),
125
- MOHAMED_WAHEED_HASSAN: re.compile(r'Mohamed Waheed(\s+Hassan)?', re.IGNORECASE),
126
- NADIA_MARCINKO: re.compile(r"Na[dď]i?a\s+Marcinko(v[aá])?", re.IGNORECASE),
127
- NEAL_KASSELL: re.compile(r'Neal\s*Kassell?', re.IGNORECASE),
128
- NICHOLAS_RIBIS: re.compile(r'Nic(holas|k)[\s._]Ribi?s?|Ribbis', re.IGNORECASE),
129
- OLIVIER_COLOM: re.compile(fr'Colom, Olivier|{OLIVIER_COLOM}', re.IGNORECASE),
130
- PAUL_BARRETT: re.compile(r'Paul Barre(d|tt)', re.IGNORECASE),
131
- PAUL_KRASSNER: re.compile(r'Pa\s?ul Krassner', re.IGNORECASE),
132
- PAUL_MORRIS: re.compile(r'morris, paul|Paul Morris', re.IGNORECASE),
133
- PAULA: re.compile(r'^Paula( Heil Fisher)?$', re.IGNORECASE),
134
- PEGGY_SIEGAL: re.compile(r'Peggy Siegal?', re.IGNORECASE),
135
- PETER_ATTIA: re.compile(r'Peter Attia?', re.IGNORECASE),
136
- PETER_MANDELSON: re.compile(r"((Lord|Peter) )?Mandelson", re.IGNORECASE),
137
- 'pink@mc2mm.com': re.compile(r"^Pink$|pink@mc2mm\.com", re.IGNORECASE),
138
- PRINCE_ANDREW: re.compile(r'Prince Andrew|The Duke', re.IGNORECASE),
139
- REID_WEINGARTEN: re.compile(r'Weingarten, Rei[cdi]|Rei[cdi] Weingarten', re.IGNORECASE),
140
- RICHARD_KAHN: re.compile(r'rich(ard)? kahn?', re.IGNORECASE),
141
- ROBERT_D_CRITTON_JR: re.compile(r'Robert D.? Critton,? Jr.?', re.IGNORECASE),
142
- ROBERT_LAWRENCE_KUHN: re.compile(r'Robert\s*(Lawrence)?\s*Kuhn', re.IGNORECASE),
143
- ROBERT_TRIVERS: re.compile(r'tri[vy]ersr@gmail|Robert\s*Trivers?', re.IGNORECASE),
144
- ROSS_GOW: re.compile(fr"Ross(acuity)? Gow|(ross@)?acuity\s*reputation(\.com)?", re.IGNORECASE),
145
- SAMUEL_LEFF: re.compile(r"Sam(uel)?(/Walli)? Leff", re.IGNORECASE),
146
- SCOTT_J_LINK: re.compile(r'scott j. link?', re.IGNORECASE),
147
- SEAN_BANNON: re.compile(r'sean bannon?', re.IGNORECASE),
148
- SHAHER_ABDULHAK_BESHER: re.compile(r'\bShaher( Abdulhak Besher)?\b', re.IGNORECASE),
149
- SOON_YI_PREVIN: re.compile(r'Soon[- ]Yi Previn?', re.IGNORECASE),
150
- STEPHEN_HANSON: re.compile(r'ste(phen|ve) hanson?|Shanson900', re.IGNORECASE),
151
- STEVE_BANNON: re.compile(r'steve banno[nr]?', re.IGNORECASE),
152
- STEVEN_SINOFSKY: re.compile(r'Steven Sinofsky?', re.IGNORECASE),
153
- SULTAN_BIN_SULAYEM: re.compile(r'Sultan (Ahmed )?bin Sulaye?m?', re.IGNORECASE),
154
- TERJE_ROD_LARSEN: re.compile(r"Terje(( (R[øo]e?d[- ])?)?Lars[eo]n)?", re.IGNORECASE),
155
- TERRY_KAFKA: re.compile(r'Terry Kafka?', re.IGNORECASE),
156
- THANU_BOONYAWATANA: re.compile(r"Thanu (BOONYAWATANA|Cnx)", re.IGNORECASE),
157
- THORBJORN_JAGLAND: re.compile(r'(Thor.{3,8})?Jag[il]and?', re.IGNORECASE),
158
- TONJA_HADDAD_COLEMAN: re.compile(r"To(nj|rl)a Haddad Coleman|haddadfm@aol.com", re.IGNORECASE),
159
- VINCENZO_IOZZO: re.compile(r"Vincenzo [IL]ozzo", re.IGNORECASE),
160
- }
161
-
162
- # If found as substring consider them the author
163
- EMAILERS = [
164
- 'Anne Boyles',
165
- AL_SECKEL,
166
- 'Ariane Dwyer',
167
- AZIZA_ALAHMADI,
168
- BILL_GATES,
169
- BILL_SIEGEL,
170
- BRAD_WECHSLER,
171
- CHRISTINA_GALBRAITH,
172
- DANIEL_SABBA,
173
- 'Danny Goldberg',
174
- DAVID_SCHOEN,
175
- DEBBIE_FEIN,
176
- DEEPAK_CHOPRA,
177
- GLENN_DUBIN,
178
- GORDON_GETTY,
179
- 'Kevin Bright',
180
- 'Jack Lang',
181
- JACK_SCAROLA,
182
- JAY_LEFKOWITZ,
183
- JES_STALEY,
184
- JOHN_PAGE,
185
- 'Jokeland',
186
- JOSCHA_BACH,
187
- 'Kathleen Ruderman',
188
- KENNETH_E_MAPP,
189
- 'Larry Cohen',
190
- LESLEY_GROFF,
191
- 'lorraine@mc2mm.com',
192
- LINDA_STONE,
193
- 'Lyn Fontanilla',
194
- MARK_TRAMO,
195
- MELANIE_WALKER,
196
- MERWIN_DELA_CRUZ,
197
- 'Michael Simmons', # Not the only "To:"
198
- 'middle.east.update@hotmail.com',
199
- 'Nancy Cain',
200
- 'Nancy Dahl',
201
- 'Nancy Portland',
202
- 'Oliver Goodenough',
203
- 'Peter Aldhous',
204
- 'Peter Green',
205
- ROGER_SCHANK,
206
- 'Roy Black',
207
- STEVEN_PFEIFFER,
208
- 'Steven Victor MD',
209
- 'Susan Edelman',
210
- TOM_BARRACK,
211
- 'Vahe Stepanian',
212
- 'Vladimir Yudashkin',
213
- ]
214
-
215
- EMAILER_REGEXES = deepcopy(EMAILER_ID_REGEXES) # Keep a copy without the simple EMAILERS regexes
216
-
217
- # Add simple matching regexes for EMAILERS entries to EMAILER_REGEXES
218
- for emailer in EMAILERS:
219
- if emailer in EMAILER_REGEXES:
220
- raise RuntimeError(f"Can't overwrite emailer regex for '{emailer}'")
221
-
222
- EMAILER_REGEXES[emailer] = re.compile(emailer, re.IGNORECASE)
223
-
224
-
225
60
  # Attribution reasons
226
61
  BOLOTOVA_REASON = 'Same signature style as 029020 ("--" followed by "Sincerely Renata Bolotova")'
227
62
  KATHY_REASON = 'from "Kathy" about dems, sent from iPad'
@@ -459,7 +294,6 @@ EMAILS_CONFIG = [
459
294
  EmailCfg(id='026287', author=DAVID_SCHOEN, attribution_reason='Signature'),
460
295
  EmailCfg(id='033419', author=DAVID_SCHOEN, attribution_reason='Signature'),
461
296
  EmailCfg(id='031460', author=EDWARD_JAY_EPSTEIN, attribution_reason='quoted reply has edwardjayepstein.com', is_fwded_article=True),
462
- EmailCfg(id='031607', is_fwded_article=True, comment='Epstein reply to Edward Jay Epstein'),
463
297
  EmailCfg(
464
298
  id='030475',
465
299
  author=FAITH_KATES,
@@ -553,7 +387,12 @@ EMAILS_CONFIG = [
553
387
  EmailCfg(id='026609', author='Mark Green', attribution_reason='Actually a fwd, Mark Green is in signature'),
554
388
  EmailCfg(id='030472', author=MARTIN_WEINBERG, attribution_reason='Maybe. in reply', is_attribution_uncertain=True),
555
389
  EmailCfg(id='032563', author=MASHA_DROKOVA, attribution_reason='replied to in 033014'),
556
- EmailCfg(id='032564', author=MASHA_DROKOVA, attribution_reason='follow up to 032563 about huffpo article with link'),
390
+ EmailCfg(
391
+ id='032564',
392
+ attribution_reason='follow up to 032563 about huffpo article with link',
393
+ author=MASHA_DROKOVA,
394
+ description='an archived version of the HuffPost link is here: https://archive.is/hJxT3 '
395
+ ),
557
396
  EmailCfg(id='031544', author=MASHA_DROKOVA, attribution_reason='follow up to 032563 about huffpo article with link'),
558
397
  EmailCfg(id='032605', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
559
398
  EmailCfg(id='032606', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
@@ -657,7 +496,7 @@ EMAILS_CONFIG = [
657
496
  EmailCfg(id='026431', recipients=[ARIANE_DE_ROTHSCHILD], attribution_reason='Reply'),
658
497
  EmailCfg(id='032876', recipients=[CECILIA_STEEN], attribution_reason='unredacted in 032267'),
659
498
  EmailCfg(id='026466', recipients=[DIANE_ZIMAN], attribution_reason='Quoted reply'),
660
- EmailCfg(id='031607', recipients=[EDWARD_JAY_EPSTEIN], attribution_reason='quoted reply has edwardjayepstein.com'),
499
+ EmailCfg(id='031607', recipients=[EDWARD_JAY_EPSTEIN], is_fwded_article=True, attribution_reason='quoted reply has edwardjayepstein.com'),
661
500
  EmailCfg(
662
501
  id='030525',
663
502
  recipients=[FAITH_KATES],
@@ -1091,6 +930,40 @@ EMAILS_CONFIG = [
1091
930
  EmailCfg(id='027028', truncate_to=1000, comment='Tom Pritzer penny pritzker'),
1092
931
  EmailCfg(id='029910', truncate_to=NO_TRUNCATE, comment='Tom Pritzer Aspen'),
1093
932
  EmailCfg(id='025163', truncate_to=NO_TRUNCATE, comment='Tom Pritzer'),
933
+
934
+ # DOJ files
935
+ EmailCfg(id='EFTA00935996', recipients=[RENATA_BOLOTOVA], attribution_reason='"sneaky dog"'),
936
+ EmailCfg(id='EFTA02731737', date='2023-06-30T16:05:00'),
937
+ EmailCfg(id='EFTA02731689', author=UNKNOWN, recipients=[None], date='2023-06-09 20:14:00'),
938
+ EmailCfg(id='EFTA02731475', date='2023-05-31T20:53:00'),
939
+ EmailCfg(id='EFTA02731732', date='2024-03-06T12:21:00'),
940
+ EmailCfg(id='EFTA02731485', date='2023-06-12T13:53:00'),
941
+ EmailCfg(id='EFTA02731617', date='2021-04-28T15:05:41'),
942
+ EmailCfg(id='EFTA02730483', date='2023-07-11T08:25:00'), # TODO: actually reply timestamp
943
+ EmailCfg(id='EFTA02730481', date='2023-07-07T11:01:00'), # TODO: actually reply timestamp
944
+ EmailCfg(id='EFTA02731754', date='2024-03-06T23:24:00'), # TODO: actually reply timestamp
945
+ EmailCfg(id='EFTA02731735', date='2024-03-04T05:04:00'), # TODO: actually reply timestamp
946
+ EmailCfg(id='EFTA02731577', date='2024-10-16T00:00:00'), # TODO: actually reply timestamp
947
+ EmailCfg(id='EFTA02730468', date='2019-07-11T08:25:00'), # TODO: This is just wrong
948
+ # Generated based on OtherFile extract_timestamp()
949
+ EmailCfg(id='EFTA02731783', date='2022-01-21 17:28:00'),
950
+ EmailCfg(id='EFTA02731587', date='2022-01-21 17:28:00'),
951
+ EmailCfg(id='EFTA02731729', date='2021-08-17 00:00:00'),
952
+ EmailCfg(id='EFTA02731578', date='2021-05-28 10:00:00'),
953
+ EmailCfg(id='EFTA02730473', date='2013-04-24 16:32:00'),
954
+ EmailCfg(id='EFTA02731699', date='2021-05-27 10:19:00'),
955
+ EmailCfg(id='EFTA02731583', date='2022-01-21 17:28:00'),
956
+ EmailCfg(id='EFTA02731552', date='2021-05-26 16:12:00'),
957
+ EmailCfg(id='EFTA00039888', date='2019-05-14 16:49:00'),
958
+ EmailCfg(id='EFTA02731684', date='2021-05-11 15:27:00'),
959
+ EmailCfg(id='EFTA02731697', date='2021-06-07 17:33:00'),
960
+ EmailCfg(id='EFTA02731733', date='2021-05-17 17:29:00'),
961
+ EmailCfg(id='EFTA00040145', date='2021-11-09 17:24:30'),
962
+ # EmailCfg(id='EFTA02730468', date='2004-02-03 00:00:00'), # TODO: ???
963
+ EmailCfg(id='EFTA02731528', date='2021-05-06 09:39:15'),
964
+ EmailCfg(id='EFTA02730485', date='2021-12-03 00:00:00'),
965
+ EmailCfg(id='EFTA00039689', truncate_to=NO_TRUNCATE),
966
+ EmailCfg(id='EFTA00995559', author=RENATA_BOLOTOVA, attribution_reason='poorly redacted signature'),
1094
967
  ]
1095
968
 
1096
969
  if args.constantize:
@@ -1163,7 +1036,7 @@ ZUBAIR_AND_ANYA = f"{ZUBAIR_KHAN} and Anya Rasulova"
1163
1036
  OTHER_FILES_BOOKS = [
1164
1037
  DocCfg(id='017088', author=ALAN_DERSHOWITZ, description=f'"Taking the Stand: My Life in the Law" (draft)'),
1165
1038
  DocCfg(id='013501', author='Arnold J. Mandell', description=f'The Nearness Of Grace: A Personal Science Of Spiritual Transformation', date='2005-01-01'),
1166
- DocCfg(id='012899', author='Ben Goertzel', description=f'Engineering General Intelligence: A Path to Advanced AGI Via Embodied Learning and Cognitive Synergy'),
1039
+ DocCfg(id='012899', author='Ben Goertzel', description=f'Engineering General Intelligence: A Path to Advanced AGI Via Embodied Learning and Cognitive Synergy', date='2013-09-19'),
1167
1040
  DocCfg(id='018438', author='Clarisse Thorn', description=f'The S&M Feminist'),
1168
1041
  DocCfg(id='019477', author=EDWARD_JAY_EPSTEIN, description=f'How America Lost Its Secrets: Edward Snowden, the Man, and the Theft'),
1169
1042
  DocCfg(id='020153', author=EDWARD_JAY_EPSTEIN, description=f'The Snowden Affair: A Spy Story In Six Parts'),
@@ -1475,6 +1348,10 @@ OTHER_FILES_LEGAL = [
1475
1348
  DocCfg(id='028540', author='SCOTUS', description=f"decision in Budha Ismail Jam et al. v. INTERNATIONAL FINANCE CORP"),
1476
1349
  DocCfg(id='012197', author='SDFL', description=f"response to {JAY_LEFKOWITZ} on Epstein Plea Agreement Compliance"),
1477
1350
  DocCfg(id='022277', description=f"text of National Labour Relations Board (NLRB) law", is_interesting=False),
1351
+
1352
+ # DOJ files
1353
+ DocCfg(id='EFTA00007157', description='victim list and police log'),
1354
+ DocCfg(id='EFTA02730274', description='evidence inventory that appears to have since been deleted from the DOJ website'),
1478
1355
  ]
1479
1356
 
1480
1357
  OTHER_FILES_CONFERENCES = [
@@ -1585,7 +1462,12 @@ OTHER_FILES_FINANCE = [
1585
1462
  DocCfg(id='024132', author=JP_MORGAN, description=JP_MORGAN_EYE_ON_THE_MARKET, date='2012-03-15'),
1586
1463
  DocCfg(id='024194', author=JP_MORGAN, description=JP_MORGAN_EYE_ON_THE_MARKET, date='2012-10-22'),
1587
1464
  DocCfg(id='025296', author='Laffer Associates', description=f'report predicting Trump win', date='2016-07-06'),
1588
- DocCfg(id='020824', author='Mary Meeker', description=f"USA Inc: A Basic Summary of America's Financial Statements compiled", date='2011-02-01'),
1465
+ DocCfg(
1466
+ id='020824',
1467
+ author='Mary Meeker',
1468
+ date='2011-02-01',
1469
+ description=f"USA Inc: A Basic Summary of America's Financial Statements compiled",
1470
+ ),
1589
1471
  DocCfg(id='025551', author='Morgan Stanley', description=f'report about alternative asset managers', date='2018-01-30'),
1590
1472
  DocCfg(id='019856', author='Sadis Goldberg LLP', description=f"report on SCOTUS ruling about insider trading", is_interesting=True),
1591
1473
  DocCfg(id='025763', author='S&P', description=f"Economic Research: How Increasing Income Inequality Is Dampening U.S. Growth", date='2014-08-05'),
@@ -1594,12 +1476,20 @@ OTHER_FILES_FINANCE = [
1594
1476
  DocCfg(id='026584', description=f"article about tax implications of disregarded entities", date='2009-07-01', is_interesting=True),
1595
1477
  DocCfg(
1596
1478
  id='024271',
1597
- description=f"Blockchain Capital and Brock Pierce pitch deck",
1598
1479
  date='2015-10-01',
1480
+ description=f"Blockchain Capital and Brock Pierce pitch deck",
1599
1481
  is_interesting=True,
1600
1482
  ),
1601
- DocCfg(id='024817', description=f"Cowen's Collective View of CBD / Cannabis report"),
1602
- DocCfg(id='012048', description=f"{PRESS_RELEASE} 'Rockefeller Partners with Gregory J. Fleming to Create Independent Financial Services Firm' and other articles"),
1483
+ DocCfg(
1484
+ id='024817',
1485
+ date='2019-02-25',
1486
+ description=f"Cowen's Collective View of CBD / Cannabis report",
1487
+ is_interesting=True
1488
+ ),
1489
+ DocCfg(
1490
+ id='012048',
1491
+ description=f"{PRESS_RELEASE} 'Rockefeller Partners with Gregory J. Fleming to Create Independent Financial Services Firm' and other articles"
1492
+ ),
1603
1493
 
1604
1494
  # private placement memoranda
1605
1495
  DocCfg(
@@ -1668,6 +1558,11 @@ OTHER_FILES_PROPERTY = [
1668
1558
  description=f"{VIRGIN_ISLANDS} property deal pitch deck, building will be leased to the U.S. govt GSA",
1669
1559
  date='2014-06-01',
1670
1560
  ),
1561
+
1562
+ # DOJ files
1563
+ DocCfg(id='EFTA00001884', date='2019-03-14', description='photo of letter from Virgin Islands DOJ to Epstein'),
1564
+ DocCfg(id='EFTA00005783', date='2019-08-29', description='heavily redacted handwritten note and 30+ completely blacked out redacted pages'),
1565
+
1671
1566
  ]
1672
1567
 
1673
1568
  OTHER_FILES_REPUTATION = [
@@ -1881,6 +1776,32 @@ OTHER_FILES_MISC = [
1881
1776
  DocCfg(id='033434', description=f"{SCREENSHOT} iPhone chat labeled 'Edwards' at the top"),
1882
1777
  DocCfg(id='029475', description=f'{VIRGIN_ISLANDS} Twin City Mobile Integrated Health Services (TCMIH) proposal/request for donation'),
1883
1778
  DocCfg(id='029448', description=f"weird short essay titled 'President Obama and Self-Deception'"),
1779
+
1780
+ # DOJ files
1781
+
1782
+ DocCfg(id='EFTA00007781', description='paychecks signed by Epstein deposited at Colonial Bank'),
1783
+ DocCfg(id='EFTA00009622', description='handwritten note transcribed Claude AI', date='2006-07-19', replace_text_with=EFTA00009622_TEXT),
1784
+ DocCfg(id='EFTA00039295', replace_text_with='Bureau of Prisons inmate telephone privileges Program Statement'),
1785
+ DocCfg(
1786
+ id='EFTA00004477',
1787
+ replace_text_with='Epstein 50th birthday photo book 12 "THAIS, MOSCOW GIRLS, AFRICA, HAWAII, [REDACTED] [REDACTED], Zorro, [REDACTED] [REDACTED] [REDACTED], CRACK WHOLE PROPOSAL, BALI/THAILAND/ASIA, RUSSIA, [REDACTED], [REDACTED], NUDES, YOGAL GIRLS',
1788
+ ),
1789
+ DocCfg(id='EFTA00008120', replace_text_with='"Part II: The Art of Receiving a Massage"'),
1790
+ DocCfg(id='EFTA00008020', replace_text_with='"Massage for Dummies"'),
1791
+ DocCfg(id='EFTA00008220', replace_text_with='"Massage book: Chapter 11: Putting the Moves Together"'),
1792
+ DocCfg(id='EFTA00008320', replace_text_with='"Massage for Dummies (???)"'),
1793
+ DocCfg(id='EFTA00000476', replace_text_with='photo of JEFFREY EPSTEIN CASH DISBURSEMENTS for the month 2006-09'),
1794
+ DocCfg(id='EFTA00039312', replace_text_with='Bureau of Prisons Program Statement / Memo about BOP Pharmacy Program'),
1795
+ # Phone bills TODO: Some kind of special handling?
1796
+ DocCfg(id='EFTA00006387', replace_text_with='T-Mobile phone bill covering 2006-06-15 to 2006-07-23'),
1797
+ DocCfg(id='EFTA00007501', replace_text_with='T-Mobile phone bill from 2005'),
1798
+ DocCfg(id='EFTA00006587', replace_text_with='T-Mobile phone bill from 2006-09-04 to 2016-10-15'),
1799
+ DocCfg(id='EFTA00006687', replace_text_with='T-Mobile phone bill from 2006-10-31 to 2006-12-25'),
1800
+ DocCfg(id='EFTA00007401', replace_text_with='T-Mobile phone bill from 2004-08-25 to 2005-07-13'),
1801
+ DocCfg(id='EFTA00007301', replace_text_with='T-Mobile response to subpoena March 23, 2007 - Blackberry phone logs for 2005'),
1802
+ DocCfg(id='EFTA00006487', replace_text_with='T-Mobile phone bill 2006-08-26'),
1803
+ DocCfg(id='EFTA00006100', replace_text_with='Palm Beach Police fax machine activity log 2005-12-28 to 2006-01-04'),
1804
+ DocCfg(id='EFTA00007253', replace_text_with='T-Mobile response to subpoena March 23, 2007 - phone bill '),
1884
1805
  ]
1885
1806
 
1886
1807
  OTHER_FILES_JUNK = [
@@ -1894,6 +1815,12 @@ OTHER_FILES_JUNK = [
1894
1815
  DocCfg(id='029351', description=OBAMA_JOKE, date='2013-07-26'),
1895
1816
  DocCfg(id='029354', description=OBAMA_JOKE, date='2013-07-26'),
1896
1817
  DocCfg(id='031293'),
1818
+
1819
+ # Completely redacted DOJ emails, no timestamp at all
1820
+ DocCfg(id='EFTA02731726'),
1821
+ DocCfg(id='EFTA02731728'),
1822
+ # Almost no timestamp
1823
+ DocCfg(id='EFTA00003154'),
1897
1824
  ]
1898
1825
 
1899
1826
  OTHER_FILES_CATEGORIES = [
@@ -19,6 +19,8 @@ MULTINEWLINE_REGEX = re.compile(r"\n{2,}")
19
19
  CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
20
20
  ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]
21
21
 
22
+ AMERICAN_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
23
+ AMERICAN_TIME_REGEX = re.compile(r"(\d{1,2}/\d{1,2}/\d{2,4}\s+\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)?)")
22
24
  PACIFIC_TZ = tz.gettz("America/Los_Angeles")
23
25
  TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
24
26
 
@@ -74,6 +74,7 @@ class DocCfg:
74
74
  duplicate_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
75
75
  is_interesting (bool | None): Override other considerations and always consider this file interesting (or not)
76
76
  is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
77
+ replace_text_with (str): If non-empty, this text is shown in place of the body of the document when printing.
77
78
  """
78
79
  id: str
79
80
  attached_to_email_id: str | None = None
@@ -88,11 +89,9 @@ class DocCfg:
88
89
  is_attribution_uncertain: bool = False
89
90
  is_interesting: bool | None = None
90
91
  is_synthetic: bool = False
92
+ replace_text_with: str = ''
91
93
 
92
- def __post_init__(self):
93
- if self.duplicate_of_id or self.duplicate_ids:
94
- self.dupe_type = self.dupe_type or SAME
95
-
94
+ @property
96
95
  def complete_description(self) -> str | None:
97
96
  """String that summarizes what is known about this document."""
98
97
  description = ''
@@ -130,17 +129,7 @@ class DocCfg:
130
129
 
131
130
  return description
132
131
 
133
- def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
134
- """Create synthetic DocCfg objects that set the 'duplicate_of_id' field to point back to this object."""
135
- for id in self.duplicate_ids:
136
- dupe_cfg = deepcopy(self)
137
- dupe_cfg.id = id
138
- dupe_cfg.duplicate_of_id = self.id
139
- dupe_cfg.duplicate_ids = []
140
- dupe_cfg.dupe_type = self.dupe_type
141
- dupe_cfg.is_synthetic = True
142
- yield dupe_cfg
143
-
132
+ @property
144
133
  def metadata(self) -> Metadata:
145
134
  metadata = {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
146
135
 
@@ -149,10 +138,26 @@ class DocCfg:
149
138
 
150
139
  return metadata
151
140
 
141
+ @property
152
142
  def timestamp(self) -> datetime | None:
153
143
  if self.date:
154
144
  return parse(self.date)
155
145
 
146
+ def __post_init__(self):
147
+ if self.duplicate_of_id or self.duplicate_ids:
148
+ self.dupe_type = self.dupe_type or SAME
149
+
150
+ def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
151
+ """Create synthetic DocCfg objects that set the 'duplicate_of_id' field to point back to this object."""
152
+ for id in self.duplicate_ids:
153
+ dupe_cfg = deepcopy(self)
154
+ dupe_cfg.id = id
155
+ dupe_cfg.duplicate_of_id = self.id
156
+ dupe_cfg.duplicate_ids = []
157
+ dupe_cfg.dupe_type = self.dupe_type
158
+ dupe_cfg.is_synthetic = True
159
+ yield dupe_cfg
160
+
156
161
  def _props_strs(self) -> list[str]:
157
162
  props = []
158
163
  add_prop = lambda f, value: props.append(f"{f.name}={value}")