datamule 2.3.4.tar.gz → 2.3.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamule has been flagged as possibly problematic.

Files changed (75)
  1. {datamule-2.3.4 → datamule-2.3.6}/PKG-INFO +1 -1
  2. {datamule-2.3.4 → datamule-2.3.6}/datamule/__init__.py +5 -3
  3. {datamule-2.3.4 → datamule-2.3.6}/datamule/book/s3transfer.py +1 -1
  4. {datamule-2.3.4 → datamule-2.3.6}/datamule/document/document.py +1 -1
  5. {datamule-2.3.4/datamule → datamule-2.3.6/datamule/portfolio}/portfolio.py +14 -23
  6. datamule-2.3.4/datamule/portfolio_compression_utils.py → datamule-2.3.6/datamule/portfolio/portfolio_compression_utils_legacy.py +2 -0
  7. {datamule-2.3.4 → datamule-2.3.6}/datamule/seclibrary/bq.py +2 -0
  8. {datamule-2.3.4/datamule → datamule-2.3.6/datamule/sheet}/sheet.py +4 -4
  9. {datamule-2.3.4/datamule → datamule-2.3.6/datamule/submission}/submission.py +142 -96
  10. datamule-2.3.6/datamule/submission/tar_submission.py +79 -0
  11. datamule-2.3.6/datamule/tables/__init__.py +0 -0
  12. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables.py +6 -6
  13. datamule-2.3.6/datamule/tags/__init__.py +0 -0
  14. datamule-2.3.6/datamule/utils/__init__.py +0 -0
  15. {datamule-2.3.4 → datamule-2.3.6}/datamule.egg-info/PKG-INFO +1 -1
  16. {datamule-2.3.4 → datamule-2.3.6}/datamule.egg-info/SOURCES.txt +20 -16
  17. {datamule-2.3.4 → datamule-2.3.6}/setup.py +1 -1
  18. {datamule-2.3.4 → datamule-2.3.6}/datamule/book/__init__.py +0 -0
  19. {datamule-2.3.4 → datamule-2.3.6}/datamule/book/book.py +0 -0
  20. {datamule-2.3.4/datamule/datamule → datamule-2.3.6/datamule/cloud}/__init__.py +0 -0
  21. {datamule-2.3.4 → datamule-2.3.6}/datamule/config.py +0 -0
  22. {datamule-2.3.4 → datamule-2.3.6}/datamule/data/listed_filer_metadata.csv +0 -0
  23. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule/datamule}/__init__.py +0 -0
  24. {datamule-2.3.4 → datamule-2.3.6}/datamule/datamule/datamule_lookup.py +0 -0
  25. {datamule-2.3.4 → datamule-2.3.6}/datamule/datamule/datamule_mysql_rds.py +0 -0
  26. {datamule-2.3.4 → datamule-2.3.6}/datamule/datamule/downloader.py +0 -0
  27. {datamule-2.3.4 → datamule-2.3.6}/datamule/datamule/sec_connector.py +0 -0
  28. {datamule-2.3.4 → datamule-2.3.6}/datamule/datasets.py +0 -0
  29. {datamule-2.3.4/datamule/document/tables → datamule-2.3.6/datamule/document}/__init__.py +0 -0
  30. {datamule-2.3.4 → datamule-2.3.6}/datamule/helper.py +0 -0
  31. {datamule-2.3.4 → datamule-2.3.6}/datamule/index.py +0 -0
  32. {datamule-2.3.4 → datamule-2.3.6}/datamule/mapping_dicts/__init__.py +0 -0
  33. {datamule-2.3.4 → datamule-2.3.6}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  34. {datamule-2.3.4 → datamule-2.3.6}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  35. {datamule-2.3.4 → datamule-2.3.6}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  36. {datamule-2.3.4 → datamule-2.3.6}/datamule/package_updater.py +0 -0
  37. {datamule-2.3.4/datamule/sec → datamule-2.3.6/datamule/portfolio}/__init__.py +0 -0
  38. {datamule-2.3.4/datamule/sec/infrastructure → datamule-2.3.6/datamule/sec}/__init__.py +0 -0
  39. {datamule-2.3.4/datamule/sec/submissions → datamule-2.3.6/datamule/sec/infrastructure}/__init__.py +0 -0
  40. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  41. {datamule-2.3.4/datamule/sec/xbrl → datamule-2.3.6/datamule/sec/submissions}/__init__.py +0 -0
  42. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/submissions/downloader.py +0 -0
  43. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/submissions/eftsquery.py +0 -0
  44. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/submissions/monitor.py +0 -0
  45. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/submissions/streamer.py +0 -0
  46. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/submissions/textsearch.py +0 -0
  47. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/utils.py +0 -0
  48. {datamule-2.3.4/datamule/seclibrary → datamule-2.3.6/datamule/sec/xbrl}/__init__.py +0 -0
  49. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  50. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  51. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  52. {datamule-2.3.4 → datamule-2.3.6}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  53. {datamule-2.3.4/datamule/sentiment → datamule-2.3.6/datamule/seclibrary}/__init__.py +0 -0
  54. {datamule-2.3.4/datamule/tags → datamule-2.3.6/datamule/sheet}/__init__.py +0 -0
  55. {datamule-2.3.4/datamule/utils → datamule-2.3.6/datamule/submission}/__init__.py +0 -0
  56. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables_13fhr.py +0 -0
  57. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables_25nse.py +0 -0
  58. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables_informationtable.py +0 -0
  59. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables_npx.py +0 -0
  60. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables_ownership.py +0 -0
  61. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables_proxyvotingrecord.py +0 -0
  62. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables_sbsef.py +0 -0
  63. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/tables_sdr.py +0 -0
  64. {datamule-2.3.4/datamule/document → datamule-2.3.6/datamule}/tables/utils.py +0 -0
  65. {datamule-2.3.4 → datamule-2.3.6}/datamule/tags/config.py +0 -0
  66. {datamule-2.3.4 → datamule-2.3.6}/datamule/tags/dictionaries.py +0 -0
  67. {datamule-2.3.4 → datamule-2.3.6}/datamule/tags/regex.py +0 -0
  68. {datamule-2.3.4 → datamule-2.3.6}/datamule/tags/utils.py +0 -0
  69. {datamule-2.3.4 → datamule-2.3.6}/datamule/utils/construct_submissions_data.py +0 -0
  70. {datamule-2.3.4 → datamule-2.3.6}/datamule/utils/format_accession.py +0 -0
  71. {datamule-2.3.4 → datamule-2.3.6}/datamule/utils/pdf.py +0 -0
  72. {datamule-2.3.4 → datamule-2.3.6}/datamule.egg-info/dependency_links.txt +0 -0
  73. {datamule-2.3.4 → datamule-2.3.6}/datamule.egg-info/requires.txt +0 -0
  74. {datamule-2.3.4 → datamule-2.3.6}/datamule.egg-info/top_level.txt +0 -0
  75. {datamule-2.3.4 → datamule-2.3.6}/setup.cfg +0 -0
--- datamule-2.3.4/PKG-INFO
+++ datamule-2.3.6/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.3.4
+Version: 2.3.6
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
--- datamule-2.3.4/datamule/__init__.py
+++ datamule-2.3.6/datamule/__init__.py
@@ -1,9 +1,9 @@
-from .submission import Submission
-from .portfolio import Portfolio
+from .submission.submission import Submission
+from .portfolio.portfolio import Portfolio
 from .document.document import Document
 from .helper import _load_package_csv, load_package_dataset
 from .config import Config
-from .sheet import Sheet
+from .sheet.sheet import Sheet
 from .index import Index
 from .package_updater import PackageUpdater
 from .utils.format_accession import format_accession
@@ -32,6 +32,8 @@ def _setup_notebook_env():
 # Set up notebook environment
 _setup_notebook_env()
 
+
+# TODO, is this load bearing?
 __all__ = [
     '_load_package_csv',
     'load_package_dataset',
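
Note on the restructure: submission.py, portfolio.py, and sheet.py moved into subpackages in 2.3.6, but the hunk above shows __init__.py re-exporting them from their new locations, so the top-level import surface should be unchanged. A minimal sanity check (assumes datamule 2.3.6 installed from PyPI):

# Top-level imports keep working across the 2.3.4 → 2.3.6 restructure,
# because datamule/__init__.py re-exports from the new subpackage paths.
from datamule import Submission, Portfolio, Sheet, Document

print(Submission.__module__)  # datamule.submission.submission
print(Portfolio.__module__)   # datamule.portfolio.portfolio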
--- datamule-2.3.4/datamule/book/s3transfer.py
+++ datamule-2.3.6/datamule/book/s3transfer.py
@@ -8,7 +8,7 @@ from datetime import datetime, timedelta
 from urllib.parse import urlparse
 from tqdm import tqdm
 import logging
-from ..sheet import Sheet
+from ..sheet.sheet import Sheet
 from ..utils.format_accession import format_accession
 
 # Set up logging
--- datamule-2.3.4/datamule/document/document.py
+++ datamule-2.3.6/datamule/document/document.py
@@ -10,7 +10,7 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-from .tables.tables import Tables
+from ..tables.tables import Tables
 
 from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
 from ..utils.pdf import has_extractable_text
--- datamule-2.3.4/datamule/portfolio.py
+++ datamule-2.3.6/datamule/portfolio/portfolio.py
@@ -1,19 +1,19 @@
 from pathlib import Path
 from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from .submission import Submission
-from .sec.submissions.downloader import download as sec_download
-from .sec.submissions.textsearch import filter_text
-from .config import Config
+from ..submission.submission import Submission
+from ..sec.submissions.downloader import download as sec_download
+from ..sec.submissions.textsearch import filter_text
+from ..config import Config
 import os
 import tarfile
 from threading import Lock
-from .helper import _process_cik_and_metadata_filters
-from .datamule.downloader import download as seclibrary_download
-from .sec.xbrl.filter_xbrl import filter_xbrl
-from .sec.submissions.monitor import Monitor
-from .portfolio_compression_utils import CompressionManager
-from .datamule.sec_connector import SecConnector
+from ..helper import _process_cik_and_metadata_filters
+from ..datamule.downloader import download as seclibrary_download
+from ..sec.xbrl.filter_xbrl import filter_xbrl
+from ..sec.submissions.monitor import Monitor
+from .portfolio_compression_utils_legacy import CompressionManager
+from ..datamule.sec_connector import SecConnector
 import shutil
 
 
@@ -31,6 +31,7 @@ class Portfolio:
 
         self.monitor = Monitor()
 
+
         if self.path.exists():
             self._load_submissions()
             self.submissions_loaded = True
@@ -47,6 +48,7 @@ class Portfolio:
         regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
         batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
 
+
        # Load regular submissions (existing logic)
         def load_submission(folder):
             return Submission(folder)
@@ -99,11 +101,12 @@ class Portfolio:
             try:
                 submission = Submission(
                     batch_tar_path=batch_tar_path,
-                    accession_prefix=accession_prefix,
+                    accession=accession_prefix,
                     portfolio_ref=self
                 )
                 submissions.append(submission)
             except Exception as e:
+                print(f"Path: {batch_tar_path}. Exception: {e}")
                 pass
                 #print(f"Path: {batch_tar_path}. Exception: {e}")
             pbar.update(1)  # Update progress for each successful submission
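
Per the hunk above, batch-tar submissions are now constructed with accession= rather than accession_prefix=, and load failures are printed instead of silently swallowed. A sketch of the new call shape (values are illustrative; Portfolio issues this call internally):

# Hypothetical values shown for shape only.
submission = Submission(
    batch_tar_path=batch_tar_path,    # e.g. Path('portfolio/batch_001.tar')
    accession='000119312525000001',   # keyword was accession_prefix= in 2.3.4
    portfolio_ref=self,
)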
@@ -111,18 +114,6 @@ class Portfolio:
         return submissions
 
 
-    def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
-        """
-        Compress all individual submissions into batch tar files.
-
-        Args:
-            compression: None, 'gzip', or 'zstd' for document compression (default: None)
-            compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
-            threshold: Size threshold for compressing individual documents (default: 1MB)
-            max_batch_size: Maximum size per batch tar file (default: 1GB)
-        """
-        CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
-
     def decompress(self):
         """
         Decompress all batch tar files back to individual submission directories.
--- datamule-2.3.4/datamule/portfolio_compression_utils.py
+++ datamule-2.3.6/datamule/portfolio/portfolio_compression_utils_legacy.py
@@ -8,6 +8,8 @@ from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
 
+# probably can delete much of this TODO
+
 
 class CompressionManager:
 
--- datamule-2.3.4/datamule/seclibrary/bq.py
+++ datamule-2.3.6/datamule/seclibrary/bq.py
@@ -2,6 +2,8 @@ import os
 import requests
 import json
 
+# slated for deletion
+
 def get_information_table(
     # Optional filtering parameters
     columns=None,
--- datamule-2.3.4/datamule/sheet.py
+++ datamule-2.3.6/datamule/sheet/sheet.py
@@ -1,10 +1,10 @@
 from pathlib import Path
 import csv
 import os
-from .helper import _process_cik_and_metadata_filters, load_package_dataset
-from .sec.xbrl.downloadcompanyfacts import download_company_facts
-from .datamule.datamule_lookup import datamule_lookup
-from .datamule.datamule_mysql_rds import query_mysql_rds
+from ..helper import _process_cik_and_metadata_filters, load_package_dataset
+from ..sec.xbrl.downloadcompanyfacts import download_company_facts
+from ..datamule.datamule_lookup import datamule_lookup
+from ..datamule.datamule_mysql_rds import query_mysql_rds
 from company_fundamentals.utils import get_fundamental_mappings
 from company_fundamentals import construct_fundamentals
 class Sheet:
--- datamule-2.3.4/datamule/submission.py
+++ datamule-2.3.6/datamule/submission/submission.py
@@ -1,10 +1,10 @@
 from pathlib import Path
 import json
-from .document.document import Document
+from ..document.document import Document
 from secsgml import parse_sgml_content_into_memory
 from secsgml.parse_sgml import transform_metadata_string
 from secsgml.utils import bytes_to_str
-from .sec.utils import headers
+from ..sec.utils import headers
 import tarfile
 import zstandard as zstd
 import gzip
@@ -12,31 +12,85 @@ import urllib.request
 from secxbrl import parse_inline_xbrl
 from company_fundamentals import construct_fundamentals
 from decimal import Decimal
-from .utils.format_accession import format_accession
-
+from ..utils.format_accession import format_accession
+from .tar_submission import tar_submission
+
+# probably needs rework later
+class FundamentalsAccessor:
+    def __init__(self, submission):
+        self.submission = submission
+        self._cache = {}
+        self._all_data = None
+
+    def __getattr__(self, category):
+        if category not in self._cache:
+            self._cache[category] = self.submission.parse_fundamentals(categories=[category])
+        return self._cache[category]
+
+    def _get_all_data(self):
+        if self._all_data is None:
+            self._all_data = self.submission.parse_fundamentals(categories=None)
+        return self._all_data
+
+    # Make the accessor behave like the underlying data
+    def __getitem__(self, key):
+        return self._get_all_data()[key]
+
+    def __repr__(self):
+        return repr(self._get_all_data())
+
+    def __str__(self):
+        return str(self._get_all_data())
+
+    def __iter__(self):
+        return iter(self._get_all_data())
+
+    def __len__(self):
+        return len(self._get_all_data()) if self._get_all_data() else 0
+
+    def __bool__(self):
+        return bool(self._get_all_data())
 
 class Submission:
     def __init__(self, path=None, sgml_content=None, keep_document_types=None,
-                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None,url=None):
+                 batch_tar_path=None, accession=None, portfolio_ref=None,url=None):
 
+        # get accession number
+        # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
+        if path is not None:
+            self.accession = format_accession(path.stem,'no-dash')
+        elif batch_tar_path is not None:
+            self.accession = format_accession(accession,'no-dash')
+        elif url is not None or sgml_content is not None:
+            if accession is None:
+                raise ValueError("If using url or sgml_content, accession must be specified.")
+            self.accession = format_accession(accession,'no-dash')
+        else:
+            raise ValueError("If this appears, please post an issue: https://github.com/john-friedman/datamule-python/issues.")
+
 
         # declare vars to be filled later
         self._xbrl = None
         self._fundamentals_cache = {}
+        self._tar = None
+        self._tar_compression_type = 'zstd'
+        self._tar_compression_level = 3
+        self._accession_year_2d = None
+        self._documents = None
 
         # Validate parameters
         param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path,url])
         if param_count != 1:
             raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
 
-        if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
-            raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")
+        if batch_tar_path is not None and (self.accession is None or portfolio_ref is None):
+            raise ValueError("batch_tar_path requires both accession and portfolio_ref")
 
         # Initialize batch tar attributes
         self.batch_tar_path = batch_tar_path
-        self.accession_prefix = accession_prefix
         self.portfolio_ref = portfolio_ref
 
+        # here should set accession either from url or make it a required argument if sgml content
        if url is not None or sgml_content is not None:
             if url is not None:
                 request = urllib.request.Request(url, headers=headers)
@@ -49,17 +103,15 @@ class Submission:
 
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
-            metadata = bytes_to_str(metadata)
+            metadata = bytes_to_str(metadata,lower=False)
 
             # standardize metadata
             metadata = transform_metadata_string(metadata)
 
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-            # code dupe
-            self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
-            self.documents = []
+            self.documents_obj_list = []
             filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata.content['documents']):
@@ -72,7 +124,7 @@ class Submission:
                 # write as txt if not declared
                 filename = doc.get('filename','.txt')
                 extension = Path(filename).suffix
-                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
+                self.documents_obj_list.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
 
                 filtered_metadata_documents.append(doc)
 
@@ -85,24 +137,22 @@ class Submission:
             # Load metadata from batch tar
             with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
                 tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
-                metadata_obj = tar_handle.extractfile(f'{accession_prefix}/metadata.json')
+                metadata_obj = tar_handle.extractfile(f'{self.accession}/metadata.json')
                 metadata = json.loads(metadata_obj.read().decode('utf-8'))
 
             # Set metadata path using :: notation
-            metadata_path = f"{batch_tar_path}::{accession_prefix}/metadata.json"
+            metadata_path = f"{batch_tar_path}::{self.accession}/metadata.json"
 
             # standardize metadata
             metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
-            # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
-            self.accession = format_accession(self.accession_prefix,'dash')
 
-            #print(f"s: {self.metadata.content['accession-number']} : {batch_tar_path}")
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
         elif path is not None:
             self.path = Path(path)
+
+
             if self.path.suffix == '.tar':
                 with tarfile.open(self.path,'r') as tar:
                     metadata_obj = tar.extractfile('metadata.json')
@@ -118,65 +168,45 @@ class Submission:
             # standardize metadata
             metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-            self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
 
         # booleans
-        self._has_xbrl = any(
+        self._xbrl_bool = any(
             doc['type'] in ('EX-100.INS', 'EX-101.INS') or
             doc.get('filename', '').endswith('_htm.xml')
             for doc in self.metadata.content['documents']
         )
 
-        self._has_fundamentals = self._has_xbrl
+        self._has_fundamentals = self._xbrl_bool
 
+
+    # TODO rework for better metadata accessing
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
         doc = self.metadata.content['documents'][idx]
 
         # If loaded from sgml_content, return pre-loaded document
         if self.path is None and self.batch_tar_path is None:
-            return self.documents[idx]
+            return self.documents_obj_list[idx]
 
         # Get filename from metadata - this is the source of truth
         filename = doc.get('filename')
         if filename is None:
             filename = doc['sequence'] + '.txt'
 
-        # Get the base extension (before any compression extension)
-        # If filename ends with .gz or .zst, the real extension is before that
-        if filename.endswith('.gz'):
-            extension = Path(filename[:-3]).suffix
-            is_compressed = 'gzip'
-        elif filename.endswith('.zst'):
-            extension = Path(filename[:-4]).suffix
-            is_compressed = 'zstd'
-        else:
-            extension = Path(filename).suffix
-            is_compressed = False
-
+        extension = Path(filename).suffix
         # Handle batch tar case
         if self.batch_tar_path is not None:
             with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
                 tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
 
                 # Use exact filename from metadata
-                tar_path = f'{self.accession_prefix}/{filename}'
+                tar_path = f'{self.accession}/{filename}'
                 content = tar_handle.extractfile(tar_path).read()
 
 
-            # Decompress if needed based on filename extension
-            if is_compressed == 'gzip':
-                content = gzip.decompress(content)
-            elif is_compressed == 'zstd':
-                content = zstd.ZstdDecompressor().decompress(content)
-
-            # Decode text files
-            # if extension in ['.htm', '.html', '.txt', '.xml']:
-            #     content = content.decode('utf-8', errors='replace')
-
-            document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+            document_path = f"{self.batch_tar_path}::{self.accession}/{filename}"
 
         # Handle regular path case
         else:
@@ -188,27 +218,7 @@ class Submission:
                     content = tar.extractfile(filename).read()
                     actual_filename = filename
                 except:
-                    try:
-                        content = tar.extractfile(filename + '.gz').read()
-                        actual_filename = filename + '.gz'
-                        is_compressed = 'gzip'
-                    except:
-                        try:
-                            content = tar.extractfile(filename + '.zst').read()
-                            actual_filename = filename + '.zst'
-                            is_compressed = 'zstd'
-                        except:
-                            raise FileNotFoundError(f"Document file not found in tar: {filename}")
-
-                # Decompress if compressed
-                if is_compressed == 'gzip':
-                    content = gzip.decompress(content)
-                elif is_compressed == 'zstd':
-                    content = zstd.ZstdDecompressor().decompress(content)
-
-                # Decode text files
-                # if extension in ['.htm', '.html', '.txt', '.xml']:
-                #     content = content.decode('utf-8', errors='replace')
+                    raise FileNotFoundError(f"Document file not found in tar: {filename}")
 
                 document_path = f"{self.path}::{actual_filename}"
 
@@ -222,15 +232,6 @@ class Submission:
             with document_path.open('rb') as f:
                 content = f.read()
 
-            # Decompress if needed based on filename extension
-            if is_compressed == 'gzip':
-                content = gzip.decompress(content)
-            elif is_compressed == 'zstd':
-                content = zstd.ZstdDecompressor().decompress(content)
-
-            # Decode text files
-            # if extension in ['.htm', '.html', '.txt', '.xml']:
-            #     content = content.decode('utf-8', errors='replace')
 
         return Document(
             type=doc['type'],
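
Net effect of the hunks above: 2.3.6 drops the transparent gzip/zstd decompression fallbacks when loading documents, so a missing file now raises FileNotFoundError immediately. If you still need to read 2.3.4-era archives with per-document .gz or .zst members, decompressing manually is straightforward (a sketch; member_name and compressed_bytes are hypothetical, and both libraries are already imported by submission.py):

import gzip
import zstandard as zstd

# compressed_bytes: raw bytes of a compressed member read from the tar
if member_name.endswith('.gz'):
    content = gzip.decompress(compressed_bytes)
elif member_name.endswith('.zst'):
    content = zstd.ZstdDecompressor().decompress(compressed_bytes)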
@@ -260,20 +261,24 @@ class Submission:
     def parse_xbrl(self):
         if self._xbrl:
             return
-
+
+        if not self._xbrl_bool:
+            print(f"Submission: {self.accession} has no xbrl")
+            return
+
         for idx, doc in enumerate(self.metadata.content['documents']):
             if doc['type'] in ['EX-100.INS','EX-101.INS']:
                 document = self._load_document_by_index(idx)
                 self._xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
-                return
-
+                return
+
             if doc['filename'].endswith('_htm.xml'):
                 document = self._load_document_by_index(idx)
                 self._xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
                 return
 
     @property
-    def xbrl(self):
+    def xbrl(self):
         if self._xbrl is None:
             self.parse_xbrl()
         return self._xbrl
@@ -353,20 +358,61 @@ class Submission:
 
     @property
     def fundamentals(self):
-        """Get all fundamental data"""
-        return self.parse_fundamentals(categories=None)
-
-    def __getattr__(self, name):
-        # Check if it's a fundamentals property request
-        if name.endswith('_fundamentals'):
-            category = name.replace('_fundamentals', '')
-            return self.parse_fundamentals(categories=[category])
+        """Access fundamentals via attributes: sub.fundamentals.incomeStatement"""
+        if not hasattr(self, '_fundamentals_accessor'):
+            self._fundamentals_accessor = FundamentalsAccessor(self)
+        return self._fundamentals_accessor
+
+    @property
+    def tar(self):
+        return self._tar_submission().getvalue()
+
+    def set_tar_compression(self,compression_type='zstd',level=3):
+        self._tar_compression_type = compression_type
+        self._tar_compression_level = level
+
+    def _tar_submission(self):
+        if self._tar is not None:
+            return self._tar
+        else:
+            documents_obj_list = self._get_documents_obj_list()
+            self._tar = tar_submission(
+                documents_obj_list=documents_obj_list,
+                metadata=self.metadata.content,
+                compression_type=self._tar_compression_type,
+                level=self._tar_compression_level
+            )
+            return self._tar
 
-        # For any other unknown attribute, try it as a fundamentals category
-        # Let parse_fundamentals handle whether it's valid or not
-        result = self.parse_fundamentals(categories=[name])
-        if result is not None:
-            return result
+    @property
+    def accession_year_2d(self):
+        return self._get_accession_year_2d()
+
+    def _get_accession_year_2d(self):
+        if self._accession_year_2d is not None:
+            return self._accession_year_2d
+        self._accession_year_2d = format_accession(self.accession,'dash').split('-')[1]
+        return self._accession_year_2d
+
+    @property
+    def documents(self):
+        return self._get_documents()
+
+    def _get_documents(self):
+        if self._documents is not None:
+            return self._documents
+
+        self._documents = self.metadata.content['documents']
+        return self._documents
+
+    def _get_documents_obj_list(self):
+        """Get all documents as Document objects"""
+        if hasattr(self, 'documents_obj_list'):
+            return self.documents_obj_list
+
+        # Generate documents_obj_list for batch tar and path cases
+        documents_obj_list = []
+        for idx in range(len(self.metadata.content['documents'])):
+            documents_obj_list.append(self._load_document_by_index(idx))
 
-        # Only raise AttributeError if parse_fundamentals returns None/empty
-        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+        return documents_obj_list
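
Taken together, these changes replace the __getattr__-based fundamentals lookup with a cached FundamentalsAccessor and add an in-memory tar export. A hedged usage sketch (the category name and output path are illustrative):

# sub.fundamentals.<category> lazily calls parse_fundamentals(categories=[<category>])
# and caches per category; bare sub.fundamentals proxies the full dataset.
income = sub.fundamentals.incomeStatement

# Tar export defaults to zstd level 3; override before the first .tar access,
# since _tar_submission() caches its result.
sub.set_tar_compression(compression_type='zstd', level=3)
with open('submission.tar', 'wb') as f:
    f.write(sub.tar)  # bytes of the tarred (optionally compressed) submission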
--- /dev/null
+++ datamule-2.3.6/datamule/submission/tar_submission.py
@@ -0,0 +1,79 @@
+import zstandard as zstd
+from secsgml.utils import calculate_documents_locations_in_tar
+import tarfile
+import io
+import json
+
+# Note: we don't actually need accession at this level. TODO
+
+def compress_content(content, compression_type, level):
+    if compression_type == 'zstd':
+        # Create compressor with specified level
+        compressor = zstd.ZstdCompressor(level=level)
+
+        # Handle string content
+        # This should never be called
+        if isinstance(content, str):
+            content_bytes = content.encode('utf-8')
+        else:
+            content_bytes = content
+
+        # Compress and return
+        return compressor.compress(content_bytes)
+
+    # Return uncompressed if not zstd
+    return content
+
+def compress_content_list(document_tuple_list, compression_type, level):
+    if compression_type is None:
+        return document_tuple_list
+
+    if level is None:
+        level = 3
+
+    # Create new list to avoid modifying original
+    compressed_list = []
+    for document_tuple in document_tuple_list:
+        content = document_tuple[0]
+        accession = document_tuple[1]
+        compressed_content = compress_content(content, compression_type, level)
+        compressed_list.append((compressed_content, accession))
+
+    return compressed_list
+
+def tar_content_list(metadata, document_tuple_list_compressed):
+    # Update metadata with compressed sizes
+    for i, (content, accession) in enumerate(document_tuple_list_compressed):
+        metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+    metadata = calculate_documents_locations_in_tar(metadata)
+
+    tar_buffer = io.BytesIO()
+    with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
+        # Add metadata first
+        metadata_json = json.dumps(metadata).encode('utf-8')
+        tarinfo = tarfile.TarInfo(f'metadata.json')
+        tarinfo.size = len(metadata_json)
+        tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+        # Add each content
+        for i, (content, accession) in enumerate(document_tuple_list_compressed):
+            doc = metadata['documents'][i]
+            filename = doc.get('filename', doc['sequence'] + '.txt')
+
+            tarinfo = tarfile.TarInfo(name=filename)
+            tarinfo.size = len(content)
+            tar.addfile(tarinfo, io.BytesIO(content))
+
+    # Return the tar buffer
+    tar_buffer.seek(0)  # Reset buffer position to beginning
+    return tar_buffer
+
+def tar_submission(metadata, documents_obj_list, compression_type=None, level=None):
+    """Takes a list of documents, compresses them, then tars them."""
+    document_tuple_list = [(doc.content, doc.accession) for doc in documents_obj_list]
+    document_tuple_list_compressed = compress_content_list(document_tuple_list,  # Fixed: correct parameter name
+                                                           compression_type=compression_type,
+                                                           level=level)
+
+    return tar_content_list(metadata, document_tuple_list_compressed)
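
Design note on the new module: metadata.json is written as the first tar member, each document's compressed size is recorded under secsgml_size_bytes, and calculate_documents_locations_in_tar (from secsgml) stamps byte offsets into the metadata before archiving, so individual documents can later be located without scanning the whole tar. A minimal driver, assuming metadata and documents shaped like Submission produces:

from datamule.submission.tar_submission import tar_submission

# documents_obj_list: objects exposing .content (bytes) and .accession
buf = tar_submission(metadata, documents_obj_list, compression_type='zstd', level=3)
with open('out.tar', 'wb') as f:
    f.write(buf.getvalue())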
--- datamule-2.3.4/datamule/document/tables/tables.py
+++ datamule-2.3.6/datamule/tables/tables.py
@@ -1,5 +1,5 @@
 from .tables_ownership import config_ownership
-from .tables_13fhr import mapping_13fhr
+from .tables_13fhr import config_13fhr
 from .tables_informationtable import config_information_table
 from .tables_25nse import config_25nse
 from .tables_npx import config_npx
@@ -18,10 +18,10 @@ all_tables_dict = {
     '4/A' : config_ownership,
     '5' : config_ownership,
     '5/A' : config_ownership,
-    '13F-HR' : mapping_13fhr,
-    '13F-HR/A' : mapping_13fhr,
-    '13F-NT' : mapping_13fhr,
-    '13F-NT/A' : mapping_13fhr,
+    '13F-HR' : config_13fhr,
+    '13F-HR/A' : config_13fhr,
+    '13F-NT' : config_13fhr,
+    '13F-NT/A' : config_13fhr,
     'INFORMATION TABLE' : config_information_table,
     '25-NSE' : config_25nse,
     '25-NSE/A' : config_25nse,
@@ -155,4 +155,4 @@ class Tables():
                 matching_tables.append(table)
                 continue
 
-        return matching_tables
+        return matching_tables
--- datamule-2.3.4/datamule.egg-info/PKG-INFO
+++ datamule-2.3.6/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.3.4
+Version: 2.3.6
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
--- datamule-2.3.4/datamule.egg-info/SOURCES.txt
+++ datamule-2.3.6/datamule.egg-info/SOURCES.txt
@@ -5,10 +5,6 @@ datamule/datasets.py
 datamule/helper.py
 datamule/index.py
 datamule/package_updater.py
-datamule/portfolio.py
-datamule/portfolio_compression_utils.py
-datamule/sheet.py
-datamule/submission.py
 datamule.egg-info/PKG-INFO
 datamule.egg-info/SOURCES.txt
 datamule.egg-info/dependency_links.txt
@@ -17,6 +13,7 @@ datamule.egg-info/top_level.txt
 datamule/book/__init__.py
 datamule/book/book.py
 datamule/book/s3transfer.py
+datamule/cloud/__init__.py
 datamule/data/listed_filer_metadata.csv
 datamule/datamule/__init__.py
 datamule/datamule/datamule_lookup.py
@@ -25,21 +22,13 @@ datamule/datamule/downloader.py
 datamule/datamule/sec_connector.py
 datamule/document/__init__.py
 datamule/document/document.py
-datamule/document/tables/__init__.py
-datamule/document/tables/tables.py
-datamule/document/tables/tables_13fhr.py
-datamule/document/tables/tables_25nse.py
-datamule/document/tables/tables_informationtable.py
-datamule/document/tables/tables_npx.py
-datamule/document/tables/tables_ownership.py
-datamule/document/tables/tables_proxyvotingrecord.py
-datamule/document/tables/tables_sbsef.py
-datamule/document/tables/tables_sdr.py
-datamule/document/tables/utils.py
 datamule/mapping_dicts/__init__.py
 datamule/mapping_dicts/html_mapping_dicts.py
 datamule/mapping_dicts/txt_mapping_dicts.py
 datamule/mapping_dicts/xml_mapping_dicts.py
+datamule/portfolio/__init__.py
+datamule/portfolio/portfolio.py
+datamule/portfolio/portfolio_compression_utils_legacy.py
 datamule/sec/__init__.py
 datamule/sec/utils.py
 datamule/sec/infrastructure/__init__.py
@@ -57,7 +46,22 @@ datamule/sec/xbrl/streamcompanyfacts.py
 datamule/sec/xbrl/xbrlmonitor.py
 datamule/seclibrary/__init__.py
 datamule/seclibrary/bq.py
-datamule/sentiment/__init__.py
+datamule/sheet/__init__.py
+datamule/sheet/sheet.py
+datamule/submission/__init__.py
+datamule/submission/submission.py
+datamule/submission/tar_submission.py
+datamule/tables/__init__.py
+datamule/tables/tables.py
+datamule/tables/tables_13fhr.py
+datamule/tables/tables_25nse.py
+datamule/tables/tables_informationtable.py
+datamule/tables/tables_npx.py
+datamule/tables/tables_ownership.py
+datamule/tables/tables_proxyvotingrecord.py
+datamule/tables/tables_sbsef.py
+datamule/tables/tables_sdr.py
+datamule/tables/utils.py
 datamule/tags/__init__.py
 datamule/tags/config.py
 datamule/tags/dictionaries.py
--- datamule-2.3.4/setup.py
+++ datamule-2.3.6/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.3.4",
+    version="2.3.6",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",