epstein-files 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -10,11 +10,12 @@ from sys import exit
 
  from dotenv import load_dotenv
  load_dotenv()
-
  from rich.markup import escape
  from rich.padding import Padding
  from rich.panel import Panel
+ from rich.text import Text

+ from epstein_files.count_words import write_word_counts_html
  from epstein_files.epstein_files import EpsteinFiles, document_cls
  from epstein_files.documents.document import INFO_PADDING, Document
  from epstein_files.documents.email import Email
@@ -24,22 +25,25 @@ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_
  from epstein_files.util.env import args, specified_names
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
  from epstein_files.util.logging import logger
- from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
+ from epstein_files.util.output import print_emails, print_json_files, print_json_metadata, print_json_stats, print_text_messages, write_urls
  from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
  from epstein_files.util.timer import Timer

+ timer = Timer()
+ epstein_files = EpsteinFiles.get_files(timer)
+

  def generate_html() -> None:
      if args.make_clean:
          make_clean()
+         write_urls()
          exit()
-
-     timer = Timer()
-     epstein_files = EpsteinFiles.get_files(timer)
-
-     if args.json_metadata:
+     elif args.json_metadata:
          print_json_metadata(epstein_files)
          exit()
+     elif args.output_json_files:
+         print_json_files(epstein_files)
+         exit()

      print_header(epstein_files)

@@ -75,7 +79,7 @@ def epstein_diff():
  def epstein_search():
      """Search the cleaned up text of the files."""
      _assert_positional_args()
-     epstein_files = EpsteinFiles.get_files(use_pickled=True)
+     epstein_files = EpsteinFiles.get_files()

      for search_term in args.positional_args:
          temp_highlighter = build_highlighter(search_term)
@@ -103,32 +107,27 @@ def epstein_show():
      """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
      _assert_positional_args()
      ids = [extract_file_id(arg) for arg in args.positional_args]
+     raw_docs = [Document(coerce_file_path(id)) for id in ids]
+     docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
      console.line()

-     if args.pickled:
-         epstein_files = EpsteinFiles.get_files(use_pickled=True)
-         docs = epstein_files.get_documents_by_id(ids)
-     else:
-         raw_docs = [Document(coerce_file_path(id)) for id in ids]
-         docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
-
      for doc in docs:
-         console.line()
-         console.print(doc)
+         if isinstance(doc, Email):
+             doc.truncation_allowed = False
+
+         console.print('\n', doc, '\n')

          if args.raw:
-             console.line()
-             console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
-             console.print(escape(doc.raw_text()))
+             console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc._border_style()))
+             console.print(escape(doc.raw_text()), '\n')

          if isinstance(doc, Email):
-             console.line()
-             console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
-             console.print(escape(doc._actual_text()))
+             console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
+             console.print(escape(doc._actual_text()), '\n')


- def epstein_dump_urls() -> None:
-     write_urls()
+ def epstein_word_count() -> None:
+     write_word_counts_html()


  def _assert_positional_args():
@@ -0,0 +1,72 @@
+ # Count word usage in emails and texts
+ import re
+
+ from epstein_files.epstein_files import EpsteinFiles
+ from epstein_files.util.constant.common_words import COMMON_WORDS_LIST
+ from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
+ from epstein_files.util.env import args, specified_names
+ from epstein_files.util.logging import logger
+ from epstein_files.util.rich import (console, print_centered, print_color_key, print_page_title, print_panel,
+     print_starred_header, write_html)
+ from epstein_files.util.search_result import MatchedLine, SearchResult
+ from epstein_files.util.timer import Timer
+ from epstein_files.util.word_count import WordCount
+
+ HTML_REGEX = re.compile(r"^http|#yiv")
+
+
+ def write_word_counts_html() -> None:
+     timer = Timer()
+     epstein_files = EpsteinFiles.get_files(timer)
+     email_subjects: set[str] = set()
+     word_count = WordCount()
+
+     # Remove dupes, junk mail, and fwded articles from emails
+     emails = [
+         e for e in epstein_files.emails
+         if not (e.is_duplicate or e.is_junk_mail() or (e.config and e.config.is_fwded_article)) \
+             and (len(specified_names) == 0 or e.author in specified_names)
+     ]
+
+     for email in emails:
+         logger.info(f"Counting words in {email}\n [SUBJECT] {email.subject()}")
+         lines = email.actual_text.split('\n')
+
+         if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
+             email_subjects.add(email.subject())
+             lines.append(email.subject())
+
+         for i, line in enumerate(lines):
+             if HTML_REGEX.search(line):
+                 continue
+
+             for word in line.split():
+                 word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
+
+     # Add in iMessage conversation words
+     imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
+
+     for imessage_log in imessage_logs:
+         logger.info(f"Counting words in {imessage_log}")
+
+         for msg in imessage_log.messages():
+             if len(specified_names) > 0 and msg.author not in specified_names:
+                 continue
+             elif HTML_REGEX.search(line):
+                 continue
+
+             for word in msg.text.split():
+                 word_count.tally_word(word, SearchResult(imessage_log, [msg.text]))
+
+     print_page_title(expand=False)
+     print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
+     print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
+     console.line()
+     print_color_key()
+     console.line()
+     console.print(word_count)
+     console.line(2)
+     print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
+     console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
+     write_html(WORD_COUNT_HTML_PATH)
+     timer.print_at_checkpoint(f"Finished counting words")
@@ -85,10 +85,9 @@ class Document:
 
          if self.is_local_extract_file():
              self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
-             cfg_type = type(self.config).__name__ if self.config else None

              # Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
-             if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
+             if self.class_name() == EMAIL_CLASS and self.config and not isinstance(self.config, EmailCfg):
                  self.config = EmailCfg.from_doc_cfg(self.config)
          else:
              self.url_slug = self.file_path.stem
@@ -26,7 +26,7 @@ from epstein_files.util.logging import logger
  from epstein_files.util.rich import *

  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
- BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
+ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
  QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
@@ -245,12 +245,10 @@ TRUNCATE_TERMS = [
  ]

  # Some Paul Krassner emails have a ton of CCed parties we don't care about
- KRASSNER_RECIPIENTS = uniquify(flatten(ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']))
+ KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))

  # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
- USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
-     KRASSNER_RECIPIENTS + \
-     FLIGHT_IN_2012_PEOPLE + [
+ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
      'Alan Rogers',  # Random CC
      'Andrew Friendly',  # Presumably some relation of Kelly Friendly
      'BS Stern',  # A random fwd of email we have
@@ -322,11 +320,18 @@ class Email(Communication):
      def __post_init__(self):
          super().__post_init__()

-         if self.config and self.config.recipients:
-             self.recipients = cast(list[str | None], self.config.recipients)
-         else:
-             for recipient in self.header.recipients():
-                 self.recipients.extend(self._get_names(recipient))
+         try:
+             if self.config and self.config.recipients:
+                 self.recipients = cast(list[str | None], self.config.recipients)
+             else:
+                 for recipient in self.header.recipients():
+                     self.recipients.extend(self._get_names(recipient))
+         except Exception as e:
+             console.print_exception()
+             console.line(2)
+             logger.fatal(f"Failed on {self.file_id}")
+             console.line(2)
+             raise e

          # Remove self CCs
          recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
@@ -21,14 +21,11 @@ class JsonFile(OtherFile):
          if self.url_slug.endswith('.txt') or self.url_slug.endswith('.json'):
              self.url_slug = Path(self.url_slug).stem

-         self._set_computed_fields(text=self.formatted_json())
+         self._set_computed_fields(text=self.json_str())

      def category(self) -> str:
          return JSON

-     def formatted_json(self) -> str:
-         return json.dumps(self.json_data(), indent=4)
-
      def info_txt(self) -> Text | None:
          return Text(f"JSON file, possibly iMessage or similar app metadata", style='white dim italic')

@@ -38,3 +35,6 @@ class JsonFile(OtherFile):
      def json_data(self) -> object:
          with open(self.file_path, encoding='utf-8-sig') as f:
              return json.load(f)
+
+     def json_str(self) -> str:
+         return json.dumps(self.json_data(), indent=4)
@@ -15,6 +15,7 @@ from epstein_files.util.data import iso_timestamp, listify, sort_dict
  from epstein_files.util.doc_cfg import Metadata, TextCfg
  from epstein_files.util.highlighted_group import get_style_for_name
  from epstein_files.util.logging import logger
+ from epstein_files.util.rich import build_table

  CONFIRMED_MSG = 'Found confirmed counterparty'
  GUESSED_MSG = 'This is probably a conversation with'
@@ -111,7 +112,7 @@ class MessengerLog(Communication):
      @classmethod
      def summary_table(cls, imessage_logs: list['MessengerLog']) -> Table:
          """Build a table summarizing the text messages in 'imessage_logs'."""
-         counts_table = Table(title="Text Message Counts By Author", header_style="bold")
+         counts_table = build_table("Text Message Counts By Author")
          counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
          counts_table.add_column('Files', justify='right', style='white')
          counts_table.add_column("Msgs", justify='right')
@@ -20,7 +20,7 @@ from epstein_files.util.data import escape_single_quotes, remove_timezone, uniqu
  from epstein_files.util.file_helper import FILENAME_LENGTH
  from epstein_files.util.env import args
  from epstein_files.util.highlighted_group import get_style_for_category
- from epstein_files.util.rich import QUESTION_MARK_TXT, highlighter
+ from epstein_files.util.rich import QUESTION_MARK_TXT, build_table, highlighter
  from epstein_files.util.logging import logger

  MAX_DAYS_SPANNED_TO_BE_VALID = 10
@@ -233,7 +233,7 @@ class OtherFile(Document):
      @staticmethod
      def build_table(docs: list['OtherFile']) -> Table:
          """Build a table of OtherFile documents."""
-         table = Table(header_style='bold', show_lines=True)
+         table = build_table(None, show_lines=True)
          table.add_column('File', justify='center', width=FILENAME_LENGTH)
          table.add_column('Date', justify='center')
          table.add_column('Size', justify='center')
@@ -19,7 +19,6 @@ from epstein_files.documents.emails.email_header import AUTHOR
  from epstein_files.documents.json_file import JsonFile
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
  from epstein_files.documents.other_file import OtherFile
- from epstein_files.util.constant.output_files import PICKLED_PATH
  from epstein_files.util.constant.strings import *
  from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
      epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
@@ -29,15 +28,16 @@ from epstein_files.util.doc_cfg import EmailCfg, Metadata
  from epstein_files.util.env import args, logger
  from epstein_files.util.file_helper import DOCS_DIR, file_size_str
  from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
- from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table, console, highlighter,
-     link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
-     print_section_header, vertically_pad)
+ from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, TABLE_BORDER_STYLE, add_cols_to_table,
+     build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
+     print_other_site_link, print_panel, print_section_header, vertically_pad)
  from epstein_files.util.search_result import SearchResult
  from epstein_files.util.timer import Timer

+ EXCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
  DEVICE_SIGNATURE = 'Device Signature'
  DEVICE_SIGNATURE_PADDING = (1, 0)
- NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
  SLOW_FILE_SECONDS = 1.0

  INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
@@ -94,23 +94,23 @@ class EpsteinFiles:
          self._tally_email_data()

      @classmethod
-     def get_files(cls, timer: Timer | None = None, use_pickled: bool = False) -> 'EpsteinFiles':
+     def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
          """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
          timer = timer or Timer()

-         if ((args.pickled or use_pickled) and PICKLED_PATH.exists()) and not args.overwrite_pickle:
+         if PICKLED_PATH.exists() and not args.overwrite_pickle:
              with gzip.open(PICKLED_PATH, 'rb') as file:
                  epstein_files = pickle.load(file)
                  timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
                  epstein_files.timer = timer
                  return epstein_files

+         logger.warning(f"Building new cache file, this will take a few minutes...")
          epstein_files = EpsteinFiles(timer=timer)

-         if args.overwrite_pickle or not PICKLED_PATH.exists():
-             with gzip.open(PICKLED_PATH, 'wb') as file:
-                 pickle.dump(epstein_files, file)
-                 logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
+         with gzip.open(PICKLED_PATH, 'wb') as file:
+             pickle.dump(epstein_files, file)
+             logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")

          timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
          return epstein_files
@@ -119,9 +119,9 @@ class EpsteinFiles:
          return self.imessage_logs + self.emails + self.other_files

      def all_emailers(self, include_useless: bool = False) -> list[str | None]:
-         """Returns all emailers except Epstein and USELESS_EMAILERS, sorted from least frequent to most."""
+         """Returns all emailers except Epstein and EXCLUDED_EMAILERS, sorted from least frequent to most."""
          names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
-         names = names if include_useless else [e for e in names if e is None or e.lower() not in NOT_INCLUDED_EMAILERS]
+         names = names if include_useless else [e for e in names if e is None or e.lower() not in EXCLUDED_EMAILERS]
          return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])

      def attributed_email_count(self) -> int:
@@ -200,10 +200,10 @@ class EpsteinFiles:
      def json_metadata(self) -> str:
          """Create a JSON string containing metadata for all the files."""
          metadata = {
-             EMAIL_CLASS: _sorted_metadata(self.emails),
-             JSON_FILE_CLASS: _sorted_metadata(self.json_files),
-             MESSENGER_LOG_CLASS: _sorted_metadata(self.imessage_logs),
-             OTHER_FILE_CLASS: _sorted_metadata(self.non_json_other_files()),
+             Email.__name__: _sorted_metadata(self.emails),
+             JsonFile.__name__: _sorted_metadata(self.json_files),
+             MessengerLog.__name__: _sorted_metadata(self.imessage_logs),
+             OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
          }

          return json.dumps(metadata, indent=4, sort_keys=True)
@@ -212,7 +212,7 @@ class EpsteinFiles:
          return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]

      def print_files_summary(self) -> None:
-         table = Table(title='Summary of Document Types')
+         table = build_table('Summary of Document Types')
          add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])

          def add_row(label: str, docs: list):
@@ -268,12 +268,12 @@ class EpsteinFiles:
 
      def print_email_device_info(self) -> None:
          print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
-         console.print(build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
-         console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
+         console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
+         console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))

      def print_emailer_counts_table(self) -> None:
          footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
-         counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
+         counts_table = build_table("Email Counts", caption=footer)
          add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])

          emailer_counts = {
@@ -345,21 +345,6 @@ class EpsteinFiles:
              self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())


- def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
-     title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
-     table = Table(header_style="bold reverse", show_lines=True, title=title)
-
-     for i, col in enumerate(cols):
-         table.add_column(col.title() + ('s' if i == 1 else ''))
-
-     new_dict = dict_sets_to_lists(keyed_sets)
-
-     for k in sorted(new_dict.keys()):
-         table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
-
-     return Padding(table, DEVICE_SIGNATURE_PADDING)
-
-
  def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
      counts: dict[str | None, int] = defaultdict(int)
@@ -372,12 +357,12 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
      return counts


- def document_cls(document: Document) -> Type[Document]:
-     search_area = document.text[0:5000]  # Limit search area to avoid pointless scans of huge files
+ def document_cls(doc: Document) -> Type[Document]:
+     search_area = doc.text[0:5000]  # Limit search area to avoid pointless scans of huge files

-     if document.text[0] == '{':
+     if doc.text[0] == '{':
          return JsonFile
-     elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
+     elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
          return Email
      elif MSG_REGEX.search(search_area):
          return MessengerLog
@@ -397,6 +382,21 @@ def is_ok_for_epstein_web(name: str | None) -> bool:
      return True


+ def _build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
+     title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
+     table = build_table(title, header_style="bold reverse", show_lines=True)
+
+     for i, col in enumerate(cols):
+         table.add_column(col.title() + ('s' if i == 1 else ''))
+
+     new_dict = dict_sets_to_lists(keyed_sets)
+
+     for k in sorted(new_dict.keys()):
+         table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
+     return Padding(table, DEVICE_SIGNATURE_PADDING)
+
+
+
  def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
      docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
      return [json_safe(d.metadata()) for d in docs_sorted_by_id]
@@ -1,20 +1,36 @@
  from pathlib import Path

- PICKLED_PATH = Path("the_epstein_files.pkl.gz")
-
- EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
- URLS_ENV = '.urls.env'
+ from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType

+ # Files output by the code
  HTML_DIR = Path('docs')
+ EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
  ALL_EMAILS_PATH = HTML_DIR.joinpath(f'all_emails_{EPSTEIN_FILES_NOV_2025}.html')
+ JSON_FILES_JSON_PATH = HTML_DIR.joinpath(f'json_files_from_{EPSTEIN_FILES_NOV_2025}.json')
  JSON_METADATA_PATH = HTML_DIR.joinpath(f'file_metadata_{EPSTEIN_FILES_NOV_2025}.json')
  TEXT_MSGS_HTML_PATH = HTML_DIR.joinpath('index.html')
  WORD_COUNT_HTML_PATH = HTML_DIR.joinpath(f'communication_word_count_{EPSTEIN_FILES_NOV_2025}.html')
  # EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
+ URLS_ENV = '.urls.env'
+
+ # Deployment URLS
+ # NOTE: don't rename these variables without changing deploy.sh!
+ GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
+ TEXT_MSGS_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
+ ALL_EMAILS_URL = f"{TEXT_MSGS_URL}/{ALL_EMAILS_PATH.name}"
+ JSON_FILES_URL = f"{TEXT_MSGS_URL}/{JSON_FILES_JSON_PATH.name}"
+ JSON_METADATA_URL = f"{TEXT_MSGS_URL}/{JSON_METADATA_PATH.name}"
+ WORD_COUNT_URL = f"{TEXT_MSGS_URL}/{WORD_COUNT_HTML_PATH.name}"
+
+ SITE_URLS: dict[SiteType, str] = {
+     EMAIL: ALL_EMAILS_URL,
+     TEXT_MESSAGE: TEXT_MSGS_URL,
+ }

  BUILD_ARTIFACTS = [
      ALL_EMAILS_PATH,
      # EPSTEIN_WORD_COUNT_HTML_PATH,
+     JSON_FILES_JSON_PATH,
      JSON_METADATA_PATH,
      TEXT_MSGS_HTML_PATH,
      WORD_COUNT_HTML_PATH,
@@ -2,13 +2,6 @@ import re
  from typing import Literal


- # Document subclass names (this sucks)
- DOCUMENT_CLASS = 'Document'
- EMAIL_CLASS = 'Email'
- JSON_FILE_CLASS = 'JsonFile'
- MESSENGER_LOG_CLASS = 'MessengerLog'
- OTHER_FILE_CLASS = 'OtherFile'
-
  # categories
  ACADEMIA = 'academia'
  ARTS = 'arts'
@@ -27,6 +20,7 @@ POLITICS = 'politics'
  PROPERTY = 'property'
  PUBLICIST = 'publicist'
  REPUTATION = 'reputation'
+ SKYPE_LOG= 'skype log'
  SOCIAL = 'social'
  SPEECH = 'speech'

@@ -55,7 +49,6 @@ TEXT_MESSAGE = 'text message'
  SiteType = Literal['email', 'text message']

  # Styles
- OTHER_SITE_LINK_STYLE = 'dark_goldenrod'
  TIMESTAMP_STYLE = 'turquoise4'
  TIMESTAMP_DIM = f"turquoise4 dim"

@@ -76,5 +69,12 @@ FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
  FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
  QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')

+ # Document subclass names (this sucks)
+ DOCUMENT_CLASS = 'Document'
+ EMAIL_CLASS = 'Email'
+ JSON_FILE_CLASS = 'JsonFile'
+ MESSENGER_LOG_CLASS = 'MessengerLog'
+ OTHER_FILE_CLASS = 'OtherFile'
+

  remove_question_marks = lambda name: QUESTION_MARKS_REGEX.sub('', name)
@@ -6,7 +6,6 @@ from inflection import parameterize
  from rich.text import Text

  from epstein_files.util.constant.output_files import *
- from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType
  from epstein_files.util.file_helper import coerce_file_stem

  # Style stuff
@@ -15,26 +14,11 @@ TEXT_LINK = 'text_link'
 
  # External site names
  ExternalSite = Literal['epstein.media', 'epsteinify', 'EpsteinWeb']
-
  EPSTEIN_MEDIA = 'epstein.media'
  EPSTEIN_WEB = 'EpsteinWeb'
  EPSTEINIFY = 'epsteinify'
  JMAIL = 'Jmail'

-
- # Deployment URLS
- # NOTE: don't rename these variables without changing deploy.sh!
- GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
- TEXT_MSGS_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
- ALL_EMAILS_URL = f'{TEXT_MSGS_URL}/{ALL_EMAILS_PATH.name}'
- JSON_METADATA_URL = f'{TEXT_MSGS_URL}/{JSON_METADATA_PATH.name}'
- WORD_COUNT_URL = f'{TEXT_MSGS_URL}/{WORD_COUNT_HTML_PATH.name}'
-
- SITE_URLS: dict[SiteType, str] = {
-     EMAIL: ALL_EMAILS_URL,
-     TEXT_MESSAGE: TEXT_MSGS_URL,
- }
-
  GH_PROJECT_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages'
  GH_MASTER_URL = f"{GH_PROJECT_URL}/blob/master"
  ATTRIBUTIONS_URL = f'{GH_MASTER_URL}/epstein_files/util/constants.py'
@@ -46,14 +30,16 @@ extracted_file_url = lambda f: f"{EXTRACTS_BASE_URL}/{f}"
  # External URLs
  COFFEEZILLA_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=061ce61c9e70bdfd'
  COURIER_NEWSROOM_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=092314e384a58618'
- EPSTEINIFY_URL = 'https://epsteinify.com'
- EPSTEIN_MEDIA_URL = 'https://www.epstein.media'
- EPSTEIN_WEB_URL = 'https://epsteinweb.org'
- JMAIL_URL = 'https://jmail.world'
  OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
  RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL = 'https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_'
  SUBSTACK_URL = 'https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great'

+ # Document source sites
+ EPSTEINIFY_URL = 'https://epsteinify.com'
+ EPSTEIN_MEDIA_URL = 'https://epstein.media'
+ EPSTEIN_WEB_URL = 'https://epsteinweb.org'
+ JMAIL_URL = 'https://jmail.world'
+
  DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
      EPSTEIN_MEDIA: f"{EPSTEIN_MEDIA_URL}/files",
      EPSTEIN_WEB: f'{EPSTEIN_WEB_URL}/wp-content/uploads/epstein_evidence/images',
@@ -61,7 +47,6 @@ DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
  }


- # TODO: epsteinify.com seems to be down as of 2025-12-30, switched to epstein.web for links
  epsteinify_api_url = lambda file_id: f"{EPSTEINIFY_URL}/api/documents/HOUSE_OVERSIGHT_{file_id}"
  epsteinify_doc_link_markup = lambda filename_or_id, style = TEXT_LINK: external_doc_link_markup(EPSTEINIFY, filename_or_id, style)
  epsteinify_doc_link_txt = lambda filename_or_id, style = TEXT_LINK: Text.from_markup(external_doc_link_markup(filename_or_id, style))