PyPI - epstein-files - Versions diffs - 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl - Mend

epstein-files 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

epstein_files/__init__.py +55 -11
epstein_files/documents/document.py +13 -2
epstein_files/documents/email.py +329 -258
epstein_files/documents/emails/email_header.py +17 -8
epstein_files/documents/other_file.py +8 -6
epstein_files/epstein_files.py +18 -4
epstein_files/person.py +65 -20
epstein_files/util/constant/names.py +18 -12
epstein_files/util/constant/output_files.py +8 -5
epstein_files/util/constant/strings.py +4 -2
epstein_files/util/constant/urls.py +13 -2
epstein_files/util/constants.py +486 -224
epstein_files/util/data.py +1 -0
epstein_files/util/doc_cfg.py +33 -27
epstein_files/util/env.py +18 -8
epstein_files/util/file_helper.py +2 -0
epstein_files/util/highlighted_group.py +321 -132
epstein_files/util/output.py +19 -24
epstein_files/util/rich.py +9 -3
epstein_files/util/word_count.py +2 -2
{epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/METADATA +3 -3
epstein_files-1.4.1.dist-info/RECORD +34 -0
{epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/entry_points.txt +1 -1
epstein_files-1.2.1.dist-info/RECORD +0 -34
{epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/LICENSE +0 -0
{epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/WHEEL +0 -0

epstein_files/util/data.py CHANGED Viewed

@@ -22,6 +22,7 @@ ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTAN
 PACIFIC_TZ = tz.gettz("America/Los_Angeles")
 TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ}  # Suppresses annoying warnings from parse() calls
+all_elements_same = lambda _list: len(_list) == 0 or all(x == _list[0] for x in _list)
 collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
 date_str = lambda dt: dt.isoformat()[0:10] if dt else None
 escape_double_quotes = lambda text: text.replace('"', r'\"')

epstein_files/util/doc_cfg.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import json
 import re
 from copy import deepcopy
 from dataclasses import Field, asdict, dataclass, field, fields
@@ -9,20 +10,21 @@ from dateutil.parser import parse
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.data import remove_zero_time, without_falsey
+from epstein_files.util.env import args
-DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
+DuplicateType = Literal['bounced', 'earlier', 'quoted', 'redacted', 'same']
 Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
 # Misc
-CONSTANTIZE_NAMES = False  # A flag set to True that causes repr() of these classes to return strings of usable code
 INDENT = '    '
 INDENT_NEWLINE = f'\n{INDENT}'
 INDENTED_JOIN = f',{INDENT_NEWLINE}'
-MAX_LINE_LENGTH = 150
+MAX_LINE_LENGTH = 135
 REPUTATION_MGMT = f'{REPUTATION} management'
 SAME = 'same'
 DUPE_TYPE_STRS: dict[DuplicateType, str] = {
+    'bounced': 'a bounced copy of',
     'earlier': 'an earlier draft of',
     'quoted': 'quoted in full in',
     'redacted': 'a redacted version of',
@@ -32,7 +34,10 @@ DUPE_TYPE_STRS: dict[DuplicateType, str] = {
 FIELD_SORT_KEY = {
     'id': 'a',
     'author': 'aa',
-    'attribution_reason': 'zz',
+    'comment': 'zz',
+    'duplicate_ids': 'dup',
+    'duplicate_of_id': 'dupe',
+    'recipients': 'aaa',
 }
 FINANCIAL_REPORTS_AUTHORS = [
@@ -49,7 +54,6 @@ FINANCIAL_REPORTS_AUTHORS = [
 # Fields like timestamp and author are better added from the Document object
 NON_METADATA_FIELDS = [
     'actual_text',
-    'date',
     'id',
     'is_synthetic',
 ]
@@ -64,18 +68,18 @@ class DocCfg:
         id (str): ID of file
         author (Name): Author of the document (if any)
         category (str | None): Type of file
-        date (str | None): If passed will be immediated parsed into the 'timestamp' field
+        date (str | None): Parsed to a datetime by timestamp() if it exists
         dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
         duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
         duplicate_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
         is_interesting (bool | None): Override other considerations and always consider this file interesting (or not)
-        timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
         is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
     """
     id: str
     attached_to_email_id: str | None = None
     author: Name = None
     category: str | None = None
+    comment: str = ''
     date: str | None = None
     description: str | None = None
     dupe_type: DuplicateType | None = None
@@ -84,12 +88,8 @@ class DocCfg:
     is_attribution_uncertain: bool = False
     is_interesting: bool | None = None
     is_synthetic: bool = False
-    timestamp: datetime | None = None
     def __post_init__(self):
-        if self.date:
-            self.timestamp = parse(self.date)
         if self.duplicate_of_id or self.duplicate_ids:
             self.dupe_type = self.dupe_type or SAME
@@ -142,7 +142,16 @@ class DocCfg:
             yield dupe_cfg
     def metadata(self) -> Metadata:
-        return {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
+        metadata = {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
+        if self.is_interesting is False:
+            metadata['is_interesting'] = False
+        return metadata
+    def timestamp(self) -> datetime | None:
+        if self.date:
+            return parse(self.date)
     def _props_strs(self) -> list[str]:
         props = []
@@ -151,20 +160,16 @@ class DocCfg:
         for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
             value = getattr(self, _field.name)
-            if value is None or value is False or (isinstance(value, list) and len(value) == 0):
+            if _field.name in ['actual_text', 'is_fwded_article', 'is_interesting']:  # fields can be False or None or ''
+                if value is not None:
+                    add_prop(_field, str(value))
+            elif not value or _field.name == 'dupe_type' and value == 'same':
                 continue
             elif _field.name == AUTHOR:
-                add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
-            elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
-                continue
-            elif _field.name == 'recipients' and value:
-                recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
-                add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
-            elif _field.name == 'timestamp' and self.date is not None:
-                continue  # Don't print both timestamp and date
-            elif isinstance(value, datetime):
-                value_str = remove_zero_time(value)
-                add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
+                add_prop(_field, constantize_name(str(value)) if args.constantize else f"'{value}'")
+            elif _field.name == 'recipients':
+                recipients_str = str([constantize_name(r) if (args.constantize and r) else r for r in value])
+                add_prop(_field, recipients_str.replace("'", '') if args.constantize else recipients_str)
             elif isinstance(value, str):
                 if "'" in value:
                     value = '"' + value.replace('"', r'\"') + '"'
@@ -182,14 +187,14 @@ class DocCfg:
         type_str = f"{type(self).__name__}("
         single_line_repr = type_str + ', '.join(props) + f')'
-        if len(single_line_repr) < MAX_LINE_LENGTH:
+        if len(single_line_repr) < MAX_LINE_LENGTH or (self.comment and getattr(self, 'is_fwded_article')):
             repr_str = single_line_repr
         else:
             repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
             repr_str += ',' if props else ''
             repr_str += '\n)'
-        if CONSTANTIZE_NAMES:
+        if args.constantize:
             repr_str = INDENT + INDENT_NEWLINE.join(repr_str.split('\n'))
             return repr_str.replace(',,', ',').replace(',),', '),').replace(',),', '),')
         else:
@@ -224,9 +229,10 @@ class EmailCfg(CommunicationCfg):
     """
     actual_text: str | None = None
     fwded_text_after: str | None = None
-    is_fwded_article: bool = False
+    is_fwded_article: bool | None = None
     recipients: list[Name] = field(default_factory=list)
     subject: str | None = None
+    truncate_to: int | None = None
     # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
     def __repr__(self) -> str:

epstein_files/util/env.py CHANGED Viewed

@@ -38,7 +38,7 @@ output.add_argument('--all-emails', '-ae', action='store_true', help='all the em
 output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
 parser.add_argument('--build', '-b', nargs="?", default=None, const=DEFAULT_FILE, help='write output to HTML file')
 output.add_argument('--email-timeline', action='store_true', help='print a table of all emails in chronological order')
-output.add_argument('--emailers-info', action='store_true', help='write a .png of the eeailers info table')
+output.add_argument('--emailers-info', '-ei', action='store_true', help='write a .png of the eeailers info table')
 output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
 output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
 output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
@@ -49,43 +49,51 @@ output.add_argument('--suppress-output', action='store_true', help='no output to
 output.add_argument('--uninteresting', action='store_true', help='only output uninteresting other files')
 output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
-scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_search, epstein_show, and epstein_diff.')
+scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_grep, epstein_show, and epstein_diff.')
 scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
+scripts.add_argument('--email-body', action='store_true', help='epstein_grep but only for the body of the email')
+scripts.add_argument('--min-line-length', type=int, help='epstein_grep minimum length of a matched line')
 scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
 scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole files')
 debug = parser.add_argument_group('DEBUG')
 debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
+debug.add_argument('--constantize', action='store_true', help='constantize names when printing repr() of objects')
 debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
 debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
 debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
 debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
 debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
+debug.add_argument('--truncate', '-t', type=int, help='truncate emails to this many characters')
+debug.add_argument('--write-txt', '-wt', action='store_true', help='write a plain text version of output')
 # Parse args
 args = parser.parse_args()
 is_html_script = parser.prog in HTML_SCRIPTS
-args.build = args.build
 args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
 args.names = [None if n == 'None' else n.strip() for n in (args.names or [])]
 args.output_emails = args.output_emails or args.all_emails
 args.output_other = args.output_other or args.all_other_files or args.uninteresting
 args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
 args.width = args.width if is_html_script else None
+args.any_output_selected = any([is_output_arg(arg) and val for arg, val in vars(args).items()])
+if not (args.any_output_selected or args.email_timeline or args.emailers_info):
+    if is_html_script:
+        logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
+    args.output_emails = args.output_other = args.output_texts = True
 if is_html_script:
     if args.positional_args:
         exit_with_error(f"{parser.prog} does not accept positional arguments (receeived {args.positional_args})")
     if parser.prog == EPSTEIN_GENERATE:
-        if any([is_output_arg(arg) and val for arg, val in vars(args).items()]):
+        if args.any_output_selected:
             if args.email_timeline:
                 exit_with_error(f"--email-timeline option is mutually exlusive with other output options")
-        elif not args.email_timeline and not args.emailers_info:
-            logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
-            args.output_texts = args.output_emails = args.output_other = True
     if args.build == DEFAULT_FILE:
         if args.all_emails:
@@ -94,13 +102,15 @@ if is_html_script:
             args.build = CHRONOLOGICAL_EMAILS_PATH
         else:
             args.build = TEXT_MSGS_HTML_PATH
-elif parser.prog.startswith('epstein_') and not args.positional_args:
+elif parser.prog.startswith('epstein_') and not args.positional_args and not args.names:
     exit_with_error(f"{parser.prog} requires positional arguments but got none!")
 if args.names:
     logger.warning(f"Output restricted to {args.names}")
     args.output_other = False
+if args.truncate and args.whole_file:
+    exit_with_error(f"--whole-file and --truncate are incompatible")
 # Log level args
 if args.deep_debug:

epstein_files/util/file_helper.py CHANGED Viewed

@@ -38,6 +38,8 @@ def extract_file_id(filename_or_id: int | str | Path) -> str:
     if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
         return id_str(filename_or_id)
+    elif isinstance(filename_or_id, str) and len(filename_or_id) == 8:
+        return f"{HOUSE_OVERSIGHT_PREFIX}{filename_or_id}"
     file_match = FILE_ID_REGEX.match(str(filename_or_id).upper())

epstein-files 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

epstein-files 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl