PyPI - epstein-files - Versions diffs - 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

epstein-files 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

epstein_files/__init__.py +11 -24
epstein_files/documents/communication.py +0 -3
epstein_files/documents/document.py +61 -18
epstein_files/documents/email.py +11 -5
epstein_files/documents/emails/email_header.py +10 -2
epstein_files/documents/imessage/text_message.py +3 -2
epstein_files/documents/other_file.py +16 -34
epstein_files/epstein_files.py +23 -33
epstein_files/person.py +39 -65
epstein_files/util/constant/names.py +13 -6
epstein_files/util/constant/strings.py +0 -1
epstein_files/util/constant/urls.py +1 -0
epstein_files/util/constants.py +3 -1
epstein_files/util/data.py +1 -1
epstein_files/util/doc_cfg.py +3 -3
epstein_files/util/env.py +4 -4
epstein_files/util/highlighted_group.py +112 -94
epstein_files/util/logging.py +1 -1
epstein_files/util/output.py +36 -12
epstein_files/util/rich.py +14 -14
epstein_files/util/timer.py +14 -0
{epstein_files-1.2.0.dist-info → epstein_files-1.2.1.dist-info}/METADATA +5 -2
epstein_files-1.2.1.dist-info/RECORD +34 -0
epstein_files-1.2.0.dist-info/RECORD +0 -34
{epstein_files-1.2.0.dist-info → epstein_files-1.2.1.dist-info}/LICENSE +0 -0
{epstein_files-1.2.0.dist-info → epstein_files-1.2.1.dist-info}/WHEEL +0 -0
{epstein_files-1.2.0.dist-info → epstein_files-1.2.1.dist-info}/entry_points.txt +0 -0

epstein_files/__init__.py CHANGED Viewed

@@ -21,7 +21,7 @@ from epstein_files.util.env import args
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
 from epstein_files.util.logging import exit_with_error, logger
 from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
-     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info_png,
+     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
      print_json_metadata, write_urls)
 from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
      print_title_page_tables, print_subtitle_panel, write_html)
@@ -38,15 +38,15 @@ def generate_html() -> None:
     timer = Timer()
     epstein_files = EpsteinFiles.get_files(timer)
-    if args.json_metadata:
+    if args.emailers_info:
+        print_emailers_info(epstein_files)
+        exit()
+    elif args.json_metadata:
         print_json_metadata(epstein_files)
         exit()
     elif args.json_files:
         print_json_files(epstein_files)
         exit()
-    elif args.emailers_info_png:
-        print_emailers_info_png(epstein_files)
-        exit()
     print_title_page_header()
@@ -59,25 +59,19 @@ def generate_html() -> None:
         exit()
     if args.output_texts:
-        imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
-        print_text_messages_section(imessage_logs)
-        timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
+        printed_logs = print_text_messages_section(epstein_files)
+        timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
     if args.output_emails:
-        emails_that_were_printed = print_emails_section(epstein_files)
-        timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
+        printed_emails = print_emails_section(epstein_files)
+        timer.log_section_complete('Email', epstein_files.emails, printed_emails)
     elif args.email_timeline:
         print_email_timeline(epstein_files)
         timer.print_at_checkpoint(f"Printed chronological emails table")
     if args.output_other:
-        if args.uninteresting:
-            files = [f for f in epstein_files.other_files if not f.is_interesting()]
-        else:
-            files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
-        print_other_files_section(files, epstein_files)
-        timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
+        printed_files = print_other_files_section(epstein_files)
+        timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)
     write_html(args.build)
     logger.warning(f"Total time: {timer.seconds_since_start_str()}")
@@ -94,7 +88,6 @@ def epstein_diff():
 def epstein_search():
     """Search the cleaned up text of the files."""
-    _assert_positional_args()
     epstein_files = EpsteinFiles.get_files()
     for search_term in args.positional_args:
@@ -117,7 +110,6 @@ def epstein_search():
 def epstein_show():
     """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
-    _assert_positional_args()
     raw_docs: list[Document] = []
     console.line()
@@ -142,8 +134,3 @@ def epstein_show():
 def epstein_word_count() -> None:
     write_word_counts_html()
-def _assert_positional_args():
-    if not args.positional_args:
-        exit_with_error(f"No positional args provided!\n")

epstein_files/documents/communication.py CHANGED Viewed

@@ -34,9 +34,6 @@ class Communication(Document):
         """Overrides super() method to apply self.author_style."""
         return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
-    def is_attribution_uncertain(self) -> bool:
-        return bool(self.config and self.config.is_attribution_uncertain)
     def summary(self) -> Text:
         return self._summary().append(CLOSE_PROPERTIES_CHAR)

epstein_files/documents/document.py CHANGED Viewed

@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
 from rich.padding import Padding
 from rich.panel import Panel
 from rich.text import Text
+from rich.table import Table
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
 from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
-from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
+from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
 from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
 from epstein_files.util.env import DOCS_DIR, args
-from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
+from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
 from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
-from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
+from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
+     highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
 from epstein_files.util.search_result import MatchedLine
 ALT_LINK_STYLE = 'white dim'
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
     'Nil Priell': 'Nili Priell',
 }
+SUMMARY_TABLE_COLS: list[str | dict] = [
+    'Count',
+    {'name': 'Has Author', 'style': 'honeydew2'},
+    {'name': 'No Author', 'style': 'wheat4'},
+    {'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
+    {'name': 'Size', 'justify': 'right', 'style': 'dim'},
+]
 @dataclass
 class Document:
@@ -181,6 +191,9 @@ class Document:
         """Secondary info about this file (description recipients, etc). Overload in subclasses."""
         return None
+    def is_attribution_uncertain(self) -> bool:
+        return bool(self.config and self.config.is_attribution_uncertain)
     def is_duplicate(self) -> bool:
         return bool(self.duplicate_of_id())
@@ -240,17 +253,6 @@ class Document:
         return text
-    def sort_key(self) -> tuple[datetime, str, int]:
-        """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
-        if self.is_duplicate():
-            sort_id = self.config.duplicate_of_id
-            dupe_idx = 1
-        else:
-            sort_id = self.file_id
-            dupe_idx = 0
-        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
     def source_file_id(self) -> str:
         """Strip off the _1, _2, etc. suffixes for extracted documents."""
         return self.file_id[0:6]
@@ -261,7 +263,7 @@ class Document:
         txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
         if self.timestamp:
-            timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
+            timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
             txt.append(' (', style=SYMBOL_STYLE)
             txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
@@ -282,6 +284,17 @@ class Document:
         return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
+    def timestamp_sort_key(self) -> tuple[datetime, str, int]:
+        """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
+        if self.is_duplicate():
+            sort_id = self.config.duplicate_of_id
+            dupe_idx = 1
+        else:
+            sort_id = self.file_id
+            dupe_idx = 0
+        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
     def top_lines(self, n: int = 10) -> str:
         """First n lines."""
         return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
@@ -361,6 +374,32 @@ class Document:
     def __str__(self) -> str:
         return self.summary().plain
+    @classmethod
+    def file_info_table(cls, title: str, first_col_name: str) -> Table:
+        """Empty table with appropriate cols for summarizing groups of files."""
+        table = build_table(title)
+        cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
+        add_cols_to_table(table, cols, 'right')
+        return table
+    @classmethod
+    def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
+        """Summary info about a group of files."""
+        file_count = len(files)
+        author_count = cls.known_author_count(files)
+        return {
+            'count': str(file_count),
+            'author_count': NA_TXT if is_author_na else str(author_count),
+            'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
+            'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
+            'bytes': file_size_to_str(sum([f.file_size() for f in files])),
+        }
+    @classmethod
+    def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
+        return [v for v in cls.files_info(files, author_na).values()]
     @staticmethod
     def diff_files(files: list[str]) -> None:
         """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
@@ -398,14 +437,18 @@ class Document:
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
-        return sorted(docs, key=lambda doc: doc.sort_key())
+        return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
-    @classmethod
-    def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
+    @staticmethod
+    def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
         """Uniquify by file_id."""
         id_map = {doc.file_id: doc for doc in documents}
         return [doc for doc in id_map.values()]
+    @staticmethod
+    def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
+        return [doc for doc in docs if not doc.is_duplicate()]
 DocumentType = TypeVar('DocumentType', bound=Document)

epstein_files/documents/email.py CHANGED Viewed

@@ -32,7 +32,7 @@ BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE
 BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
 DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
 LINK_LINE_REGEX = re.compile(f"^(> )?htt")
-QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
+QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
 REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
 BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -114,7 +114,7 @@ EMAIL_SIGNATURE_REGEXES = {
     DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
     DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
     JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
-    JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
+    JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
     KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
     LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
     LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
@@ -436,9 +436,9 @@ class Email(Communication):
         elif self.header.num_header_rows == 0:
             return self.text
-        reply_text_match = REPLY_TEXT_REGEX.search(text)
         self.log_top_lines(20, "Raw text:", logging.DEBUG)
         self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
+        reply_text_match = REPLY_TEXT_REGEX.search(text)
         if reply_text_match:
             actual_num_chars = len(reply_text_match.group(1))
@@ -550,9 +550,15 @@ class Email(Communication):
     def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
         """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
-        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
+        if text is None:
+            header_offset = len(self.header.header_chars)
+            text = self.text[header_offset:]
+        else:
+            header_offset = 0
+        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
             if i >= n:
-                return match.end() - 1
+                return match.end() + header_offset - 1
     def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
         """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""

epstein_files/documents/emails/email_header.py CHANGED Viewed

@@ -9,7 +9,6 @@ from epstein_files.util.logging import logger
 from epstein_files.util.rich import UNKNOWN
 FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
-NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
 ON_BEHALF_OF = 'on behalf of'
 TO_FIELDS = ['bcc', 'cc', 'to']
 EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
@@ -28,10 +27,18 @@ CONFIGURED_ACTUAL_TEXTS = [
     if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
 ]
+NON_HEADER_FIELDS = [
+    'field_names',
+    'header_chars',
+    'num_header_rows',
+    'was_initially_empty',
+]
 @dataclass(kw_only=True)
 class EmailHeader:
     field_names: list[str]  # Order is same as the order header fields appear in the email file text
+    header_chars: str = ''
     num_header_rows: int = field(init=False)
     was_initially_empty: bool = False
@@ -101,6 +108,7 @@ class EmailHeader:
             setattr(self, field_name, value)
         self.num_header_rows = len(self.field_names) + num_headers
+        self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
         log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
         logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
@@ -163,7 +171,7 @@ class EmailHeader:
         if should_log_header:
             logger.debug(f"Header being parsed was this:\n\n{header}\n")
-        return EmailHeader(field_names=field_names, **kw_args)
+        return cls(field_names=field_names, header_chars=header, **kw_args)
     @staticmethod
     def cleanup_str(_str: str) -> str:

epstein_files/documents/imessage/text_message.py CHANGED Viewed

@@ -4,7 +4,7 @@ from datetime import datetime
 from rich.text import Text
-from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
+from epstein_files.util.constant.names import ANTHONY_SCARAMUCCI, JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
 from epstein_files.util.constant.strings import TIMESTAMP_DIM
 from epstein_files.util.data import iso_timestamp
 from epstein_files.util.highlighted_group import get_style_for_name
@@ -17,6 +17,7 @@ PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
 UNCERTAIN_SUFFIX = ' (?)'
 DISPLAY_LAST_NAME_ONLY = [
+    ANTHONY_SCARAMUCCI,
     JEFFREY_EPSTEIN,
     STEVE_BANNON,
 ]
@@ -59,7 +60,7 @@ class TextMessage:
         try:
             timestamp_str = iso_timestamp(self.parse_timestamp())
         except Exception as e:
-            logger.warning(f"Failed to parse timestamp for {self}")
+            logger.info(f"Failed to parse timestamp for {self}")
             timestamp_str = self.timestamp_str
         return Text(f"[{timestamp_str}]", style=TIMESTAMP_DIM)

epstein_files/documents/other_file.py CHANGED Viewed

@@ -22,7 +22,7 @@ from epstein_files.util.data import days_between, escape_single_quotes, remove_t
 from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
 from epstein_files.util.env import args
 from epstein_files.util.highlighted_group import QUESTION_MARKS_TXT, styled_category
-from epstein_files.util.rich import build_table, highlighter
+from epstein_files.util.rich import add_cols_to_table, build_table, highlighter
 from epstein_files.util.logging import logger
 FIRST_FEW_LINES = 'First Few Lines'
@@ -209,39 +209,8 @@ class OtherFile(Document):
         if num_days_spanned > MAX_DAYS_SPANNED_TO_BE_VALID and VAST_HOUSE not in self.text:
             self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
-    @staticmethod
-    def count_by_category_table(files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
-        counts = defaultdict(int)
-        category_bytes = defaultdict(int)
-        for file in files:
-            if file.category() is None:
-                logger.warning(f"file {file.file_id} has no category")
-            counts[file.category()] += 1
-            category_bytes[file.category()] += file.file_size()
-        table = build_table(f'{title_pfx}Other Files Summary', ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
-        table.columns[-1].justify = 'right'
-        table.columns[0].min_width = 14
-        table.columns[-1].style = 'dim'
-        for (category, count) in sort_dict(counts):
-            category_files = [f for f in files if f.category() == category]
-            known_author_count = Document.known_author_count(category_files)
-            table.add_row(
-                styled_category(category),
-                str(count),
-                str(known_author_count),
-                str(count - known_author_count),
-                file_size_to_str(category_bytes[category]),
-            )
-        return table
-    @staticmethod
-    def files_preview_table(files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
+    @classmethod
+    def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
         """Build a table of OtherFile documents."""
         table = build_table(f'{title_pfx}Other Files Details in Chronological Order', show_lines=True)
         table.add_column('File', justify='center', width=FILENAME_LENGTH)
@@ -272,3 +241,16 @@ class OtherFile(Document):
             )
         return table
+    @classmethod
+    def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
+        categories = uniquify([f.category() for f in files])
+        categories = sorted(categories, key=lambda c: -len([f for f in files if f.category() == c]))
+        table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
+        for category in categories:
+            category_files = [f for f in files if f.category() == category]
+            table.add_row(styled_category(category), *cls.files_info_row(category_files))
+        table.columns = table.columns[:-2] + [table.columns[-1]]  # Removee unknown author col
+        return table

epstein_files/epstein_files.py CHANGED Viewed

@@ -9,6 +9,8 @@ from datetime import datetime
 from pathlib import Path
 from typing import Sequence, Type, cast
+from rich.table import Table
 from epstein_files.documents.document import Document
 from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
 from epstein_files.documents.json_file import JsonFile
@@ -22,7 +24,6 @@ from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.env import DOCS_DIR, args, logger
 from epstein_files.util.file_helper import file_size_str
 from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
-from epstein_files.util.rich import NA_TXT, add_cols_to_table, build_table, console, print_centered
 from epstein_files.util.search_result import SearchResult
 from epstein_files.util.timer import Timer
@@ -31,9 +32,13 @@ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
 SLOW_FILE_SECONDS = 1.0
 EMAILS_WITH_UNINTERESTING_CCS = [
-    '025329',  # Krassner
-    '024923',  # Krassner
-    '033568',  # Krassner
+    '025329',    # Krassner
+    '024923',    # Krassner
+    '033568',    # Krassner
+]
+EMAILS_WITH_UNINTERESTING_BCCS = [
+    '014797_1',  # Ross Gow
 ]
@@ -45,7 +50,7 @@ class EpsteinFiles:
     json_files: list[JsonFile] = field(default_factory=list)
     other_files: list[OtherFile] = field(default_factory=list)
     timer: Timer = field(default_factory=lambda: Timer())
-    uninteresting_ccs: list[Name] = field(init=False)
+    uninteresting_ccs: list[Name] = field(default_factory=list)
     def __post_init__(self):
         """Iterate through files and build appropriate objects."""
@@ -88,13 +93,12 @@ class EpsteinFiles:
         if PICKLED_PATH.exists() and not args.overwrite_pickle and not args.skip_other_files:
             with gzip.open(PICKLED_PATH, 'rb') as file:
                 epstein_files = pickle.load(file)
-                epstein_files.timer = timer
                 timer_msg = f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}'"
-                epstein_files.timer.print_at_checkpoint(f"{timer_msg} ({file_size_str(PICKLED_PATH)})")
+                timer.print_at_checkpoint(f"{timer_msg} ({file_size_str(PICKLED_PATH)})")
                 return epstein_files
         logger.warning(f"Building new cache file, this will take a few minutes...")
-        epstein_files = EpsteinFiles(timer=timer)
+        epstein_files = EpsteinFiles()
         if args.skip_other_files:
             logger.warning(f"Not writing pickled data because --skip-other-files")
@@ -235,7 +239,7 @@ class EpsteinFiles:
         return json.dumps(metadata, indent=4, sort_keys=True)
     def non_duplicate_emails(self) -> list[Email]:
-        return [email for email in self.emails if not email.is_duplicate()]
+        return Document.without_dupes(self.emails)
     def non_json_other_files(self) -> list[OtherFile]:
         return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
@@ -253,34 +257,20 @@ class EpsteinFiles:
             for name in names
         ]
-    def print_files_summary(self) -> None:
-        table = build_table('File Overview')
-        add_cols_to_table(table, ['File Type', 'Count', 'Author Known', 'Author Unknown', 'Duplicates'])
-        table.columns[1].justify = 'right'
-        def add_row(label: str, docs: list):
-            known = None if isinstance(docs[0], JsonFile) else Document.known_author_count(docs)
-            table.add_row(
-                label,
-                f"{len(docs):,}",
-                f"{known:,}" if known is not None else NA_TXT,
-                f"{len(docs) - known:,}" if known is not None else NA_TXT,
-                f"{len([d for d in docs if d.is_duplicate()])}",
-            )
-        add_row('Emails', self.emails)
-        add_row('iMessage Logs', self.imessage_logs)
-        add_row('JSON Data', self.json_files)
-        add_row('Other', self.non_json_other_files())
-        print_centered(table)
-        console.line()
+    def overview_table(self) -> Table:
+        table = Document.file_info_table('Files Overview', 'File Type')
+        table.add_row('Emails', *Document.files_info_row(self.emails))
+        table.add_row('iMessage Logs', *Document.files_info_row(self.imessage_logs))
+        table.add_row('JSON Data', *Document.files_info_row(self.json_files, True))
+        table.add_row('Other', *Document.files_info_row(self.non_json_other_files()))
+        return table
     def unknown_recipient_ids(self) -> list[str]:
         """IDs of emails whose recipient is not known."""
         return sorted([e.file_id for e in self.emails if None in e.recipients or not e.recipients])
     def uninteresting_emailers(self) -> list[Name]:
+        """Emailers whom we don't want to print a separate section for because they're just CCed."""
         if '_uninteresting_emailers' not in vars(self):
             self._uninteresting_emailers = sorted(uniquify(UNINTERESTING_EMAILERS + self.uninteresting_ccs))
@@ -306,8 +296,8 @@ class EpsteinFiles:
         self.emails = Document.sort_by_timestamp(self.emails)
     def _set_uninteresting_ccs(self) -> None:
-        ross_gow_email = self.email_for_id('014797_1')
-        self.uninteresting_ccs = copy(cast(list[Name], ross_gow_email.header.bcc))
+        for id in EMAILS_WITH_UNINTERESTING_BCCS:
+            self.uninteresting_ccs += copy(cast(list[Name], self.email_for_id(id).header.bcc))
         for id in EMAILS_WITH_UNINTERESTING_CCS:
             self.uninteresting_ccs += self.email_for_id(id).recipients

epstein-files 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

epstein-files 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl