epstein-files 1.0.13__py3-none-any.whl → 1.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,7 @@ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_R
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constants import *
 from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
-from epstein_files.util.data import escape_single_quotes, remove_timezone, sort_dict, uniquify
+from epstein_files.util.data import days_between, escape_single_quotes, remove_timezone, sort_dict, uniquify
 from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
 from epstein_files.util.env import args
 from epstein_files.util.highlighted_group import styled_category
@@ -36,94 +36,62 @@ TIMESTAMP_LOG_INDENT = f'{LOG_INDENT} '
 VAST_HOUSE = 'vast house' # Michael Wolff article draft about Epstein indicator
 VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)

-UNINTERESTING_CATEGORES = [
+SKIP_TIMESTAMP_EXTRACT = [
+    PALM_BEACH_TSV,
+    PALM_BEACH_PROPERTY_INFO,
+]
+
+UNINTERESTING_CATEGORIES = [
+    ACADEMIA,
+    ARTICLE,
     ARTS,
     BOOK,
+    CONFERENCE,
     JUNK,
+    POLITICS,
     SKYPE_LOG,
-    SPEECH,
 ]

 # OtherFiles whose descriptions/info match these prefixes are not displayed unless --all-other-files is used
-UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
+UNINTERESTING_PREFIXES = [
     'article about',
-    ARTICLE_DRAFT,
-    'Aviation International',
-    BBC,
-    BLOOMBERG,
-    'Boston Globe',
     BROCKMAN_INC,
-    CHINA_DAILY,
-    CNN,
-    'completely redacted',
     CVRA,
-    DAILY_MAIL,
-    DAILY_TELEGRAPH,
-    CVRA_LEXIS_SEARCH[0:-12], # Because date at end :(
     DERSH_GIUFFRE_TWEET,
-    'Financial Times',
-    'Forbes',
-    'Frontlines',
-    'Future Science',
-    'Globe and Mail',
     GORDON_GETTY,
     f"{HARVARD} Econ",
     HARVARD_POETRY,
-    'Inference',
     JASTA,
-    'JetGala',
-    JOHN_BOLTON_PRESS_CLIPPING,
-    'Journal of Criminal',
-    LA_TIMES,
-    'Litigation Daily',
-    LAWRENCE_KRAUSS,
-    LAWRENCE_KRAUSS_ASU_ORIGINS,
-    'MarketWatch',
-    MARTIN_NOWAK,
-    'Morning News',
+    LEXIS_NEXIS,
     NOBEL_CHARITABLE_TRUST,
-    'Nautilus',
-    'New Yorker',
-    NYT,
     PALM_BEACH_CODE_ENFORCEMENT,
-    PALM_BEACH_DAILY_NEWS,
-    PALM_BEACH_POST,
     PALM_BEACH_TSV,
     PALM_BEACH_WATER_COMMITTEE,
-    PAUL_KRASSNER,
-    PEGGY_SIEGAL,
-    'Politifact',
-    'Rafanelli',
-    ROBERT_LAWRENCE_KUHN,
-    ROBERT_TRIVERS,
-    'SCMP',
-    'SciencExpress',
-    'Scowcroft',
-    SHIMON_POST_ARTICLE,
-    SINGLE_PAGE,
-    STACEY_PLASKETT,
-    'Tatler',
-    TERJE_ROD_LARSEN,
-    TEXT_OF_US_LAW,
-    TRANSLATION,
     TWEET,
-    REAL_DEAL_ARTICLE,
-    TRUMP_DISCLOSURES,
-    UBS_CIO_REPORT,
     UN_GENERAL_ASSEMBLY,
-    'U.S. News',
     'US Office',
-    'Vanity Fair',
-    VI_DAILY_NEWS,
-    WAPO,
+]
+
+INTERESTING_AUTHORS = [
+    EDWARD_JAY_EPSTEIN,
+    EHUD_BARAK,
+    JOI_ITO,
+    NOAM_CHOMSKY,
+    MICHAEL_WOLFF,
+    SVETLANA_POZHIDAEVA,
 ]


 @dataclass
 class OtherFile(Document):
-    """File that is not an email, an iMessage log, or JSON data."""
+    """
+    File that is not an email, an iMessage log, or JSON data.

-    include_description_in_summary_panel: ClassVar[bool] = True
+    Attributes:
+        was_timestamp_extracted (bool): True if the timestamp was programmatically extracted (and could be wrong)
+    """
+    was_timestamp_extracted: bool = False
+    include_description_in_summary_panel: ClassVar[bool] = True # Class var for logging output

     def __post_init__(self):
         super().__post_init__()
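Note: the new was_timestamp_extracted is declared as an ordinary dataclass field, while include_description_in_summary_panel keeps its ClassVar annotation and therefore remains class-level. A minimal illustration of that distinction, using a made-up Example class rather than the package's own:

# ClassVar-annotated names are excluded from dataclass fields (PEP 526),
# so only was_timestamp_extracted varies per instance.
from dataclasses import dataclass, fields
from typing import ClassVar

@dataclass
class Example:
    was_timestamp_extracted: bool = False                        # per-instance field
    include_description_in_summary_panel: ClassVar[bool] = True  # shared class attribute

assert [f.name for f in fields(Example)] == ['was_timestamp_extracted']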
@@ -162,11 +130,13 @@ class OtherFile(Document):
         elif len(info_sentences) == 0:
             return True
         elif self.config:
-            if self.config.is_interesting:
+            if self.config.is_interesting is not None:
+                return self.config.is_interesting
+            elif self.config.author in INTERESTING_AUTHORS:
                 return True
             elif self.category() == FINANCE and self.author is not None:
                 return False
-            elif self.category() in UNINTERESTING_CATEGORES:
+            elif self.category() in UNINTERESTING_CATEGORIES:
                 return False

         for prefix in UNINTERESTING_PREFIXES:
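Note: switching the test from truthiness to `is not None` makes the config flag tri-state, so an explicitly configured False now short-circuits the remaining heuristics instead of falling through to them. A sketch of the pattern with hypothetical names (resolve is not part of the package):

# Tri-state Optional[bool]: None means "not configured, use heuristics".
from typing import Optional

def resolve(configured: Optional[bool], heuristic_result: bool) -> bool:
    if configured is not None:  # explicit True and explicit False both win
        return configured
    return heuristic_result

assert resolve(False, True) is False  # a plain truthiness test would have returned True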
@@ -178,6 +148,10 @@ class OtherFile(Document):
     def metadata(self) -> Metadata:
         metadata = super().metadata()
         metadata['is_interesting'] = self.is_interesting()
+
+        if self.was_timestamp_extracted:
+            metadata['was_timestamp_extracted'] = self.was_timestamp_extracted
+
         return metadata

     def preview_text(self) -> str:
@@ -191,6 +165,8 @@ class OtherFile(Document):
         """Return configured timestamp or value extracted by scanning text with datefinder."""
         if self.config and self.config.timestamp:
             return self.config.timestamp
+        elif self.config and any([s in (self.config_description() or '') for s in SKIP_TIMESTAMP_EXTRACT]):
+            return None

         timestamps: list[datetime] = []

@@ -214,7 +190,10 @@ class OtherFile(Document):
             self.log_top_lines(15, msg=f"No timestamps found")

             return None
-        elif len(timestamps) == 1:
+
+        self.was_timestamp_extracted = True
+
+        if len(timestamps) == 1:
             return timestamps[0]
         else:
             timestamps = sorted(uniquify(timestamps), reverse=True)
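Note: per the docstring above, the fallback scans the document text with datefinder and prefers the most recent timestamp found. A simplified standalone sketch of that strategy, assuming the datefinder package (extract_timestamp is illustrative; the real method also sets was_timestamp_extracted, logs candidates, and consults SKIP_TIMESTAMP_EXTRACT):

from datetime import datetime

import datefinder

def extract_timestamp(text: str) -> datetime | None:
    timestamps = list(datefinder.find_dates(text))

    if not timestamps:
        return None

    # Most recent timestamp appearing in the text is usually the closest
    return sorted(set(timestamps), reverse=True)[0]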
@@ -222,7 +201,7 @@ class OtherFile(Document):
             return timestamps[0] # Most recent timestamp appearing in text is usually the closest

     def _log_extracted_timestamps_info(self, timestamps: list[datetime]) -> None:
-        num_days_spanned = (timestamps[0] - timestamps[-1]).days
+        num_days_spanned = days_between(timestamps[-1], timestamps[0])
         timestamps_log_msg = f"Extracted {len(timestamps)} timestamps spanning {num_days_spanned} days{TIMESTAMP_LOG_INDENT}"
         timestamps_log_msg += TIMESTAMP_LOG_INDENT.join([str(dt) for dt in timestamps])

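Note: days_between() is newly imported from epstein_files.util.data, but its implementation is not part of this diff. Judging from this call site and from email_conversation_length_in_days() further down, it presumably looks something like the sketch below; whether the count is inclusive of both endpoints is an assumption (the old code added 1 at one call site but not the other):

# Presumed shape of epstein_files.util.data.days_between (not shown in this diff).
from datetime import datetime

def days_between(start: datetime, end: datetime) -> int:
    """Whole days from 'start' to 'end'; endpoint inclusivity is a guess."""
    return (end - start).days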
@@ -230,9 +209,9 @@ class OtherFile(Document):
         self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)

     @staticmethod
-    def build_table(files: Sequence['OtherFile']) -> Table:
+    def files_preview_table(files: Sequence['OtherFile']) -> Table:
         """Build a table of OtherFile documents."""
-        table = build_table(None, show_lines=True)
+        table = build_table('Other Files Details', show_lines=True)
         table.add_column('File', justify='center', width=FILENAME_LENGTH)
         table.add_column('Date', justify='center')
         table.add_column('Size', justify='center')
@@ -240,7 +219,7 @@ class OtherFile(Document):
         table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')

         for file in files:
-            link_and_info = [file.external_links()]
+            link_and_info = [file.external_links_txt()]
             date_str = file.date_str()

             if file.is_duplicate():
@@ -272,10 +251,10 @@ class OtherFile(Document):
                 logger.warning(f"file {file.file_id} has no category")

             counts[file.category()] += 1
-            category_bytes[file.category()] += file.length
+            category_bytes[file.category()] += file.file_size()

-        table = build_table('Other Files Summary')
-        add_cols_to_table(table, ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
+        table = build_table('Other Files Summary', ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
+        table.columns[0].min_width = 14
         table.columns[-1].style = 'dim'

         for (category, count) in sort_dict(counts):
@@ -23,14 +23,14 @@ from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
     epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
 from epstein_files.util.constants import *
-from epstein_files.util.data import dict_sets_to_lists, iso_timestamp, json_safe, listify, sort_dict
+from epstein_files.util.data import days_between, dict_sets_to_lists, json_safe, listify, sort_dict
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
-from epstein_files.util.env import DOCS_DIR, args, logger
+from epstein_files.util.env import DOCS_DIR, args, logger, specified_names
 from epstein_files.util.file_helper import file_size_str
-from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
+from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames, get_info_for_name, get_style_for_name
 from epstein_files.util.rich import (DEFAULT_NAME_STYLE, LAST_TIMESTAMP_STYLE, NA_TXT, add_cols_to_table,
-    build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
-    print_other_site_link, print_panel, print_section_header, vertically_pad)
+    print_all_files_page_link, build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
+    print_panel, print_section_header, vertically_pad)
 from epstein_files.util.search_result import SearchResult
 from epstein_files.util.timer import Timer

@@ -72,18 +72,18 @@ class EpsteinFiles:
 
         # Read through and classify all the files
         for file_arg in self.all_files:
-            doc_timer = Timer(decimals=4)
+            doc_timer = Timer(decimals=2)
             document = Document(file_arg)
             cls = document_cls(document)

-            if document.length == 0:
+            if document.length() == 0:
                 logger.warning(f"Skipping empty file: {document}]")
                 continue
             elif args.skip_other_files and cls == OtherFile and file_type_count[cls.__name__] > 1:
-                logger.warning(f"Skipping {document.filename}...")
+                document.log(f"Skipping OtherFile...")
                 continue

-            documents.append(cls(file_arg, text=document.text))
+            documents.append(cls(file_arg, lines=document.lines, text=document.text))
             logger.info(str(documents[-1]))
             file_type_count[cls.__name__] += 1

@@ -104,16 +104,20 @@ class EpsteinFiles:
         if PICKLED_PATH.exists() and not args.overwrite_pickle:
             with gzip.open(PICKLED_PATH, 'rb') as file:
                 epstein_files = pickle.load(file)
-            timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
             epstein_files.timer = timer
+            timer_msg = f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}'"
+            epstein_files.timer.print_at_checkpoint(f"{timer_msg} ({file_size_str(PICKLED_PATH)})")
             return epstein_files

         logger.warning(f"Building new cache file, this will take a few minutes...")
         epstein_files = EpsteinFiles(timer=timer)

-        with gzip.open(PICKLED_PATH, 'wb') as file:
-            pickle.dump(epstein_files, file)
-            logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
+        if args.skip_other_files:
+            logger.warning(f"Not writing pickled data because --skip-other-files")
+        else:
+            with gzip.open(PICKLED_PATH, 'wb') as file:
+                pickle.dump(epstein_files, file)
+                logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")

         timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
         return epstein_files
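Note: the method above is a gzip + pickle memoization of the expensive build step. The general pattern, reduced to a self-contained sketch (load_or_build and its parameters are illustrative, not the package's API):

import gzip
import pickle
from pathlib import Path
from typing import Callable, TypeVar

T = TypeVar('T')

def load_or_build(cache_path: Path, build: Callable[[], T], overwrite: bool = False) -> T:
    if cache_path.exists() and not overwrite:
        with gzip.open(cache_path, 'rb') as f:
            return pickle.load(f)  # only safe because the cache is written locally

    obj = build()

    with gzip.open(cache_path, 'wb') as f:
        pickle.dump(obj, f)

    return obj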
@@ -127,9 +131,6 @@ class EpsteinFiles:
         names = names if include_useless else [e for e in names if e is None or e.lower() not in EXCLUDED_EMAILERS]
         return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])

-    def attributed_email_count(self) -> int:
-        return sum([i for author, i in self.email_author_counts.items() if author != UNKNOWN])
-
     def docs_matching(
         self,
         pattern: re.Pattern | str,
@@ -156,7 +157,7 @@ class EpsteinFiles:
         return self.emails_for(author)[-1].timestamp

     def email_conversation_length_in_days(self, author: str | None) -> int:
-        return (self.last_email_at(author) - self.earliest_email_at(author)).days + 1
+        return days_between(self.earliest_email_at(author), self.last_email_at(author))

     def email_signature_substitution_counts(self) -> dict[str, int]:
         """Return the number of times an email signature was replaced with "<...snipped...>" for each author."""
@@ -172,7 +173,7 @@ class EpsteinFiles:
         return sorted(list(self.unknown_recipient_email_ids))

     def emails_by(self, author: str | None) -> list[Email]:
-        return [e for e in self.emails if e.author == author]
+        return Document.sort_by_timestamp([e for e in self.emails if e.author == author])

     def emails_for(self, author: str | None) -> list[Email]:
         """Returns emails to or from a given 'author' sorted chronologically."""
@@ -185,9 +186,11 @@ class EpsteinFiles:
 
     def emails_to(self, author: str | None) -> list[Email]:
         if author is None:
-            return [e for e in self.emails if len(e.recipients) == 0 or None in e.recipients]
+            emails = [e for e in self.emails if len(e.recipients) == 0 or None in e.recipients]
         else:
-            return [e for e in self.emails if author in e.recipients]
+            emails = [e for e in self.emails if author in e.recipients]
+
+        return Document.sort_by_timestamp(emails)

     def get_documents_by_id(self, file_ids: str | list[str]) -> list[Document]:
         file_ids = listify(file_ids)
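Note: emails_by() and emails_to() now both route their results through Document.sort_by_timestamp(), which is defined elsewhere in the package and not shown in this diff. A plausible reconstruction, assuming Document has a nullable timestamp attribute:

# Hypothetical reconstruction of Document.sort_by_timestamp().
from datetime import datetime

def sort_by_timestamp(docs: list) -> list:
    # Documents lacking a timestamp sort first instead of raising on None
    return sorted(docs, key=lambda doc: doc.timestamp or datetime.min)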
@@ -204,14 +207,26 @@ class EpsteinFiles:
     def json_metadata(self) -> str:
         """Create a JSON string containing metadata for all the files."""
         metadata = {
-            Email.__name__: _sorted_metadata(self.emails),
-            JsonFile.__name__: _sorted_metadata(self.json_files),
-            MessengerLog.__name__: _sorted_metadata(self.imessage_logs),
-            OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
+            'files': {
+                Email.__name__: _sorted_metadata(self.emails),
+                JsonFile.__name__: _sorted_metadata(self.json_files),
+                MessengerLog.__name__: _sorted_metadata(self.imessage_logs),
+                OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
+            },
+            'people': {
+                name: highlighted_group.get_info(name)
+                for highlighted_group in HIGHLIGHTED_NAMES
+                if isinstance(highlighted_group, HighlightedNames)
+                for name, description in highlighted_group.emailers.items()
+                if description
+            }
         }

         return json.dumps(metadata, indent=4, sort_keys=True)

+    def non_duplicate_emails(self) -> list[Email]:
+        return [email for email in self.emails if not email.is_duplicate()]
+
     def non_json_other_files(self) -> list[OtherFile]:
         return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]

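Note: the new 'people' block is a nested dict comprehension: the outer loop filters HIGHLIGHTED_NAMES down to HighlightedNames groups, the inner loop walks each group's emailers, and only entries with a description are kept. The same shape reduced to a generic sketch with made-up data:

# Generic shape of the nested comprehension used in json_metadata().
groups = [{'alice': 'financier', 'bob': ''}, {'carol': 'pilot'}]

people = {
    name: description
    for group in groups                        # outer loop: each group of emailers
    for name, description in group.items()    # inner loop: each (name, description)
    if description                             # keep only entries with a description
}

assert people == {'alice': 'financier', 'carol': 'pilot'}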
@@ -230,8 +245,8 @@ class EpsteinFiles:
             f"{len([d for d in docs if d.is_duplicate()])}",
         )

-        add_row('iMessage Logs', self.imessage_logs)
         add_row('Emails', self.emails)
+        add_row('iMessage Logs', self.imessage_logs)
         add_row('JSON Data', self.json_files)
         add_row('Other', self.non_json_other_files())
         console.print(Align.center(table))
@@ -271,12 +286,51 @@ class EpsteinFiles:
         console.print(Align.center(Email.build_table(emails, author)), '\n')

     def print_email_device_info(self) -> None:
-        print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
+        print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(2, 0, 0, 0), centered=True)
         console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
         console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))

-    def print_emailer_counts_table(self) -> None:
-        footer = f"Identified authors of {self.attributed_email_count():,} out of {len(self.emails):,} emails ."
+    def print_other_files_section(self, files: list[OtherFile]) -> None:
+        """Returns the OtherFile objects that were interesting enough to print."""
+        category_table = OtherFile.count_by_category_table(files)
+        other_files_preview_table = OtherFile.files_preview_table(files)
+        header_pfx = '' if args.all_other_files else 'Selected '
+        print_section_header(f"{FIRST_FEW_LINES} of {len(files)} {header_pfx}Files That Are Neither Emails Nor Text Messages")
+
+        if args.all_other_files:
+            console.line(1)
+        else:
+            print_all_files_page_link(self)
+            console.line(2)
+
+        for table in [category_table, other_files_preview_table]:
+            table.title = f"{header_pfx}{table.title}"
+
+        print_centered(category_table)
+        console.line(2)
+        console.print(other_files_preview_table)
+
+    def print_text_messages_section(self) -> None:
+        """Print summary table and stats for text messages."""
+        print_section_header('All of His Text Messages')
+        print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
+        authors: list[str | None] = specified_names if specified_names else [JEFFREY_EPSTEIN]
+        log_files = self.imessage_logs_for(authors)
+
+        for log_file in log_files:
+            console.print(Padding(log_file))
+            console.line(2)
+
+        print_centered(MessengerLog.summary_table(self.imessage_logs))
+        text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
+        text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
+        console.print(text_summary_msg)
+        imessage_msg_count = sum([len(log.messages) for log in self.imessage_logs])
+        console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")
+
+    def table_of_emailers(self) -> Table:
+        attributed_emails = [e for e in self.non_duplicate_emails() if e.author]
+        footer = f"Identified authors of {len(attributed_emails):,} out of {len(self.non_duplicate_emails()):,} emails."
         counts_table = build_table("Email Counts", caption=footer)

         add_cols_to_table(counts_table, [
@@ -308,49 +362,17 @@ class EpsteinFiles:
             str(self.email_recipient_counts[name]),
             emails[0].timestamp_without_seconds(),
             emails[-1].timestamp_without_seconds(),
-            '' if name is None else link_text_obj(search_jmail_url(name), JMAIL),
-            '' if not is_ok_for_epstein_web(name) else link_text_obj(epstein_media_person_url(name), 'eMedia'),
-            '' if not is_ok_for_epstein_web(name) else link_text_obj(epstein_web_person_url(name), 'eWeb'),
-            '' if name is None else link_text_obj(search_twitter_url(name), 'search X'),
+            link_text_obj(search_jmail_url(name), JMAIL) if name else '',
+            link_text_obj(epstein_media_person_url(name), 'eMedia') if is_ok_for_epstein_web(name) else '',
+            link_text_obj(epstein_web_person_url(name), 'eWeb') if is_ok_for_epstein_web(name) else '',
+            link_text_obj(search_twitter_url(name), 'search X') if name else '',
         )

-        console.print(vertically_pad(counts_table, 2))
-
-    def print_imessage_summary(self) -> None:
-        """Print summary table and stats for text messages."""
-        console.print(MessengerLog.summary_table(self.imessage_logs))
-        text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
-        text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
-        console.print(text_summary_msg)
-        imessage_msg_count = sum([len(log.messages) for log in self.imessage_logs])
-        console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")
-
-    def print_other_files_table(self) -> list[OtherFile]:
-        """Returns the OtherFile objects that were interesting enough to print."""
-        interesting_files = [doc for doc in self.other_files if args.all_other_files or doc.is_interesting()]
-        header_pfx = '' if args.all_other_files else 'Selected '
-        print_section_header(f"{FIRST_FEW_LINES} of {len(interesting_files)} {header_pfx}Files That Are Neither Emails Nor Text Msgs")
-
-        if not args.all_other_files:
-            print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and {len(self.emails):,} emails)", style='dim')
-            print_other_site_link(False)
-            console.line(2)
-
-        console.print(OtherFile.build_table(interesting_files))
-        console.print(Padding(OtherFile.count_by_category_table(interesting_files), (2, 0, 2, 2)))
-        skipped_file_count = len(self.other_files) - len(interesting_files)
-
-        if skipped_file_count > 0:
-            logger.warning(f"Skipped {skipped_file_count} uninteresting other files...")
-
-        return interesting_files
+        return counts_table

     def _tally_email_data(self) -> None:
         """Tally up summary info about Email objects."""
-        for email in self.emails:
-            if email.is_duplicate():
-                continue
-
+        for email in self.non_duplicate_emails():
             self.email_author_counts[email.author] += 1

             if len(email.recipients) == 0:
@@ -380,7 +402,7 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
 def document_cls(doc: Document) -> Type[Document]:
     search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files

-    if doc.length == 0:
+    if doc.length() == 0:
         return Document
     if doc.text[0] == '{':
         return JsonFile
@@ -238,7 +238,7 @@ OTHER_NAMES = NAMES_TO_NOT_HIGHLIGHT + """
 ian isaac isaacson
 james jamie jane janet jason jen jim joe johnson jones josh julie justin
 karl kate kathy kelly kim kruger kyle
-laurie leo leonard lenny leslie lieberman louis lynch lynn
+laurie lawrence leo leonard lenny leslie lieberman louis lynch lynn
 marcus marianne matt matthew melissa michele michelle moore moscowitz
 nancy nicole nussbaum
 owen
@@ -22,7 +22,6 @@ PUBLICIST = 'publicist'
 REPUTATION = 'reputation'
 SKYPE_LOG = 'Skype log'
 SOCIAL = 'social'
-SPEECH = 'speech'

 # Locations
 PALM_BEACH = 'Palm Beach'
@@ -35,6 +34,7 @@ CHINA_DAILY = "China Daily"
 DAILY_MAIL = 'Daily Mail'
 DAILY_TELEGRAPH = "Daily Telegraph"
 LA_TIMES = 'LA Times'
+LEXIS_NEXIS = 'Lexis Nexis'
 MIAMI_HERALD = 'Miami Herald'
 NYT = "New York Times"
 PALM_BEACH_DAILY_NEWS = f'{PALM_BEACH} Daily News'