datamule 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamule might be problematic.

Files changed (28)
  1. datamule/__init__.py +5 -3
  2. datamule/book/s3transfer.py +1 -1
  3. datamule/document/document.py +1 -1
  4. datamule/{portfolio.py → portfolio/portfolio.py} +14 -23
  5. datamule/{portfolio_compression_utils.py → portfolio/portfolio_compression_utils_legacy.py} +2 -0
  6. datamule/seclibrary/bq.py +2 -0
  7. datamule/sheet/__init__.py +0 -0
  8. datamule/{sheet.py → sheet/sheet.py} +4 -4
  9. datamule/submission/__init__.py +0 -0
  10. datamule/{submission.py → submission/submission.py} +150 -97
  11. datamule/submission/tar_submission.py +79 -0
  12. datamule/tables/__init__.py +0 -0
  13. {datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/METADATA +1 -1
  14. {datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/RECORD +28 -24
  15. /datamule/{document/tables → cloud}/__init__.py +0 -0
  16. /datamule/{sentiment → portfolio}/__init__.py +0 -0
  17. /datamule/{document/tables → tables}/tables.py +0 -0
  18. /datamule/{document/tables → tables}/tables_13fhr.py +0 -0
  19. /datamule/{document/tables → tables}/tables_25nse.py +0 -0
  20. /datamule/{document/tables → tables}/tables_informationtable.py +0 -0
  21. /datamule/{document/tables → tables}/tables_npx.py +0 -0
  22. /datamule/{document/tables → tables}/tables_ownership.py +0 -0
  23. /datamule/{document/tables → tables}/tables_proxyvotingrecord.py +0 -0
  24. /datamule/{document/tables → tables}/tables_sbsef.py +0 -0
  25. /datamule/{document/tables → tables}/tables_sdr.py +0 -0
  26. /datamule/{document/tables → tables}/utils.py +0 -0
  27. {datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/WHEEL +0 -0
  28. {datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/top_level.txt +0 -0
datamule/__init__.py CHANGED
@@ -1,9 +1,9 @@
- from .submission import Submission
- from .portfolio import Portfolio
+ from .submission.submission import Submission
+ from .portfolio.portfolio import Portfolio
  from .document.document import Document
  from .helper import _load_package_csv, load_package_dataset
  from .config import Config
- from .sheet import Sheet
+ from .sheet.sheet import Sheet
  from .index import Index
  from .package_updater import PackageUpdater
  from .utils.format_accession import format_accession
@@ -32,6 +32,8 @@ def _setup_notebook_env():
  # Set up notebook environment
  _setup_notebook_env()

+
+ # TODO, is this load bearing?
  __all__ = [
      '_load_package_csv',
      'load_package_dataset',
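
The internal modules move into subpackages, but the root __init__.py re-exports are updated in lockstep, so code that imports from the package root should be unaffected. A minimal sketch of the unchanged public surface (assuming no other re-exports were dropped):

    # Top-level imports keep working across 2.3.5 -> 2.3.7,
    # since datamule/__init__.py re-exports the moved classes:
    from datamule import Submission, Portfolio, Sheet, Document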
datamule/book/s3transfer.py CHANGED
@@ -8,7 +8,7 @@ from datetime import datetime, timedelta
  from urllib.parse import urlparse
  from tqdm import tqdm
  import logging
- from ..sheet import Sheet
+ from ..sheet.sheet import Sheet
  from ..utils.format_accession import format_accession

  # Set up logging
datamule/document/document.py CHANGED
@@ -10,7 +10,7 @@ from pathlib import Path
  import webbrowser
  from secsgml.utils import bytes_to_str
  import tempfile
- from .tables.tables import Tables
+ from ..tables.tables import Tables

  from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
  from ..utils.pdf import has_extractable_text
datamule/{portfolio.py → portfolio/portfolio.py} RENAMED
@@ -1,19 +1,19 @@
  from pathlib import Path
  from tqdm import tqdm
  from concurrent.futures import ThreadPoolExecutor, as_completed
- from .submission import Submission
- from .sec.submissions.downloader import download as sec_download
- from .sec.submissions.textsearch import filter_text
- from .config import Config
+ from ..submission.submission import Submission
+ from ..sec.submissions.downloader import download as sec_download
+ from ..sec.submissions.textsearch import filter_text
+ from ..config import Config
  import os
  import tarfile
  from threading import Lock
- from .helper import _process_cik_and_metadata_filters
- from .datamule.downloader import download as seclibrary_download
- from .sec.xbrl.filter_xbrl import filter_xbrl
- from .sec.submissions.monitor import Monitor
- from .portfolio_compression_utils import CompressionManager
- from .datamule.sec_connector import SecConnector
+ from ..helper import _process_cik_and_metadata_filters
+ from ..datamule.downloader import download as seclibrary_download
+ from ..sec.xbrl.filter_xbrl import filter_xbrl
+ from ..sec.submissions.monitor import Monitor
+ from .portfolio_compression_utils_legacy import CompressionManager
+ from ..datamule.sec_connector import SecConnector
  import shutil


@@ -31,6 +31,7 @@ class Portfolio:

          self.monitor = Monitor()

+
          if self.path.exists():
              self._load_submissions()
              self.submissions_loaded = True
@@ -47,6 +48,7 @@ class Portfolio:
          regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
          batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']

+
          # Load regular submissions (existing logic)
          def load_submission(folder):
              return Submission(folder)
@@ -99,11 +101,12 @@ class Portfolio:
                  try:
                      submission = Submission(
                          batch_tar_path=batch_tar_path,
-                         accession_prefix=accession_prefix,
+                         accession=accession_prefix,
                          portfolio_ref=self
                      )
                      submissions.append(submission)
                  except Exception as e:
+                     print(f"Path: {batch_tar_path}. Exception: {e}")
                      pass
                      #print(f"Path: {batch_tar_path}. Exception: {e}")
                  pbar.update(1) # Update progress for each successful submission
@@ -111,18 +114,6 @@ class Portfolio:
          return submissions


-     def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
-         """
-         Compress all individual submissions into batch tar files.
-
-         Args:
-             compression: None, 'gzip', or 'zstd' for document compression (default: None)
-             compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
-             threshold: Size threshold for compressing individual documents (default: 1MB)
-             max_batch_size: Maximum size per batch tar file (default: 1GB)
-         """
-         CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
-
      def decompress(self):
          """
          Decompress all batch tar files back to individual submission directories.
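
Portfolio.compress() is removed in this release while decompress() stays, and CompressionManager now lives in the legacy module. Callers that still need batch compression could invoke the legacy manager directly; a sketch under the assumption that the legacy compress_portfolio keeps the positional signature the removed method used (the "filings/" path is a hypothetical local portfolio directory):

    from datamule import Portfolio
    from datamule.portfolio.portfolio_compression_utils_legacy import CompressionManager

    portfolio = Portfolio("filings/")  # hypothetical path
    # Mirrors the body of the removed Portfolio.compress():
    # (portfolio, compression, compression_level, threshold, max_batch_size, max_workers)
    CompressionManager().compress_portfolio(
        portfolio, 'zstd', 3, 1048576, 1024 * 1024 * 1024, portfolio.MAX_WORKERS
    )

Portfolios compressed under 2.3.5 should be expanded with portfolio.decompress() before relying on 2.3.7 document loading, since the transparent .gz/.zst handling was also removed from Submission (see the submission.py diff below).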
datamule/{portfolio_compression_utils.py → portfolio/portfolio_compression_utils_legacy.py} RENAMED
@@ -8,6 +8,8 @@ from tqdm import tqdm
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar

+ # probably can delete much of this TODO
+

  class CompressionManager:

datamule/seclibrary/bq.py CHANGED
@@ -2,6 +2,8 @@ import os
  import requests
  import json

+ # slated for deletion
+
  def get_information_table(
      # Optional filtering parameters
      columns=None,
datamule/sheet/__init__.py ADDED (empty file)
datamule/{sheet.py → sheet/sheet.py} RENAMED
@@ -1,10 +1,10 @@
  from pathlib import Path
  import csv
  import os
- from .helper import _process_cik_and_metadata_filters, load_package_dataset
- from .sec.xbrl.downloadcompanyfacts import download_company_facts
- from .datamule.datamule_lookup import datamule_lookup
- from .datamule.datamule_mysql_rds import query_mysql_rds
+ from ..helper import _process_cik_and_metadata_filters, load_package_dataset
+ from ..sec.xbrl.downloadcompanyfacts import download_company_facts
+ from ..datamule.datamule_lookup import datamule_lookup
+ from ..datamule.datamule_mysql_rds import query_mysql_rds
  from company_fundamentals.utils import get_fundamental_mappings
  from company_fundamentals import construct_fundamentals
  class Sheet:
datamule/submission/__init__.py ADDED (empty file)
datamule/{submission.py → submission/submission.py} RENAMED
@@ -1,42 +1,103 @@
  from pathlib import Path
  import json
- from .document.document import Document
+ from ..document.document import Document
  from secsgml import parse_sgml_content_into_memory
  from secsgml.parse_sgml import transform_metadata_string
  from secsgml.utils import bytes_to_str
- from .sec.utils import headers
+ from ..sec.utils import headers
  import tarfile
- import zstandard as zstd
- import gzip
  import urllib.request
  from secxbrl import parse_inline_xbrl
  from company_fundamentals import construct_fundamentals
  from decimal import Decimal
- from .utils.format_accession import format_accession
-
+ from ..utils.format_accession import format_accession
+ from .tar_submission import tar_submission
+
+ # probably needs rework later
+ class FundamentalsAccessor:
+     def __init__(self, submission):
+         self.submission = submission
+         self._cache = {}
+         self._all_data = None
+
+     def __getattr__(self, name):
+         # Try as category first
+         try:
+             if name not in self._cache:
+                 result = self.submission.parse_fundamentals(categories=[name])
+                 if result:  # Only cache if we got actual data
+                     self._cache[name] = result
+                 return result
+         except:
+             pass
+
+         # Fall back to dict behavior
+         return getattr(self._get_all_data(), name)
+
+     def _get_all_data(self):
+         if self._all_data is None:
+             self._all_data = self.submission.parse_fundamentals(categories=None)
+         return self._all_data
+
+     # Make the accessor behave like the underlying data
+     def __getitem__(self, key):
+         return self._get_all_data()[key]
+
+     def __repr__(self):
+         return repr(self._get_all_data())
+
+     def __str__(self):
+         return str(self._get_all_data())
+
+     def __iter__(self):
+         return iter(self._get_all_data())
+
+     def __len__(self):
+         return len(self._get_all_data()) if self._get_all_data() else 0
+
+     def __bool__(self):
+         return bool(self._get_all_data())

  class Submission:
      def __init__(self, path=None, sgml_content=None, keep_document_types=None,
-                  batch_tar_path=None, accession_prefix=None, portfolio_ref=None,url=None):
+                  batch_tar_path=None, accession=None, portfolio_ref=None,url=None):

+         # get accession number
+         # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
+         if path is not None:
+             self.accession = format_accession(path.stem,'no-dash')
+         elif batch_tar_path is not None:
+             self.accession = format_accession(accession,'no-dash')
+         elif url is not None or sgml_content is not None:
+             if accession is None:
+                 raise ValueError("If using url or sgml_content, accession must be specified.")
+             self.accession = format_accession(accession,'no-dash')
+         else:
+             raise ValueError("If this appears, please post an issue: https://github.com/john-friedman/datamule-python/issues.")
+

          # declare vars to be filled later
          self._xbrl = None
          self._fundamentals_cache = {}
+         self._tar = None
+         self._tar_compression_type = 'zstd'
+         self._tar_compression_level = 3
+         self._accession_year_2d = None
+         self._documents = None

          # Validate parameters
          param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path,url])
          if param_count != 1:
              raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")

-         if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
-             raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")
+         if batch_tar_path is not None and (self.accession is None or portfolio_ref is None):
+             raise ValueError("batch_tar_path requires both accession and portfolio_ref")

          # Initialize batch tar attributes
          self.batch_tar_path = batch_tar_path
-         self.accession_prefix = accession_prefix
          self.portfolio_ref = portfolio_ref

+         # here should set accession either from url or make it a required argument if sgml content
          if url is not None or sgml_content is not None:
              if url is not None:
                  request = urllib.request.Request(url, headers=headers)
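
The accession number is now resolved up front in __init__, and construction from url or sgml_content requires it explicitly instead of reading it from parsed metadata. A usage sketch (the URL and accession number are hypothetical placeholders; the value is stored in no-dash form via format_accession):

    from datamule import Submission

    # 2.3.5 inferred the accession from metadata; 2.3.7 requires it up front:
    sub = Submission(
        url="https://www.sec.gov/Archives/edgar/data/.../0000000000-25-000000.txt",
        accession="0000000000-25-000000",
    )
    print(sub.accession)  # normalized to 'no-dash' form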
@@ -49,17 +110,15 @@ class Submission:

              self.path = None
              metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
-             metadata = bytes_to_str(metadata)
+             metadata = bytes_to_str(metadata,lower=False)

              # standardize metadata
              metadata = transform_metadata_string(metadata)

              self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-             # code dupe
-             self.accession = self.metadata.content['accession-number']
              self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"

-             self.documents = []
+             self.documents_obj_list = []
              filtered_metadata_documents = []

              for idx,doc in enumerate(self.metadata.content['documents']):
@@ -72,7 +131,7 @@ class Submission:
                  # write as txt if not declared
                  filename = doc.get('filename','.txt')
                  extension = Path(filename).suffix
-                 self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
+                 self.documents_obj_list.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))

                  filtered_metadata_documents.append(doc)

@@ -85,24 +144,22 @@ class Submission:
              # Load metadata from batch tar
              with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
                  tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
-                 metadata_obj = tar_handle.extractfile(f'{accession_prefix}/metadata.json')
+                 metadata_obj = tar_handle.extractfile(f'{self.accession}/metadata.json')
                  metadata = json.loads(metadata_obj.read().decode('utf-8'))

              # Set metadata path using :: notation
-             metadata_path = f"{batch_tar_path}::{accession_prefix}/metadata.json"
+             metadata_path = f"{batch_tar_path}::{self.accession}/metadata.json"

              # standardize metadata
              metadata = transform_metadata_string(metadata)
              self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
-             # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
-             self.accession = format_accession(self.accession_prefix,'dash')

-             #print(f"s: {self.metadata.content['accession-number']} : {batch_tar_path}")
              self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"

          elif path is not None:
              self.path = Path(path)
+
+
              if self.path.suffix == '.tar':
                  with tarfile.open(self.path,'r') as tar:
                      metadata_obj = tar.extractfile('metadata.json')
@@ -118,65 +175,45 @@ class Submission:
              # standardize metadata
              metadata = transform_metadata_string(metadata)
              self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-             self.accession = self.metadata.content['accession-number']
              self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"


          # booleans
-         self._has_xbrl = any(
+         self._xbrl_bool = any(
              doc['type'] in ('EX-100.INS', 'EX-101.INS') or
              doc.get('filename', '').endswith('_htm.xml')
              for doc in self.metadata.content['documents']
          )

-         self._has_fundamentals = self._has_xbrl
+         self._has_fundamentals = self._xbrl_bool

+
+     # TODO rework for better metadata accessing
      def _load_document_by_index(self, idx):
          """Load a document by its index in the metadata documents list."""
          doc = self.metadata.content['documents'][idx]

          # If loaded from sgml_content, return pre-loaded document
          if self.path is None and self.batch_tar_path is None:
-             return self.documents[idx]
+             return self.documents_obj_list[idx]

          # Get filename from metadata - this is the source of truth
          filename = doc.get('filename')
          if filename is None:
              filename = doc['sequence'] + '.txt'

-         # Get the base extension (before any compression extension)
-         # If filename ends with .gz or .zst, the real extension is before that
-         if filename.endswith('.gz'):
-             extension = Path(filename[:-3]).suffix
-             is_compressed = 'gzip'
-         elif filename.endswith('.zst'):
-             extension = Path(filename[:-4]).suffix
-             is_compressed = 'zstd'
-         else:
-             extension = Path(filename).suffix
-             is_compressed = False
-
+         extension = Path(filename).suffix
          # Handle batch tar case
          if self.batch_tar_path is not None:
              with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
                  tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]

                  # Use exact filename from metadata
-                 tar_path = f'{self.accession_prefix}/{filename}'
+                 tar_path = f'{self.accession}/{filename}'
                  content = tar_handle.extractfile(tar_path).read()


-             # Decompress if needed based on filename extension
-             if is_compressed == 'gzip':
-                 content = gzip.decompress(content)
-             elif is_compressed == 'zstd':
-                 content = zstd.ZstdDecompressor().decompress(content)
-
-             # Decode text files
-             # if extension in ['.htm', '.html', '.txt', '.xml']:
-             #     content = content.decode('utf-8', errors='replace')
-
-             document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+             document_path = f"{self.batch_tar_path}::{self.accession}/{filename}"

          # Handle regular path case
          else:
@@ -188,27 +225,7 @@ class Submission:
                      content = tar.extractfile(filename).read()
                      actual_filename = filename
                  except:
-                     try:
-                         content = tar.extractfile(filename + '.gz').read()
-                         actual_filename = filename + '.gz'
-                         is_compressed = 'gzip'
-                     except:
-                         try:
-                             content = tar.extractfile(filename + '.zst').read()
-                             actual_filename = filename + '.zst'
-                             is_compressed = 'zstd'
-                         except:
-                             raise FileNotFoundError(f"Document file not found in tar: {filename}")
-
-                 # Decompress if compressed
-                 if is_compressed == 'gzip':
-                     content = gzip.decompress(content)
-                 elif is_compressed == 'zstd':
-                     content = zstd.ZstdDecompressor().decompress(content)
-
-                 # Decode text files
-                 # if extension in ['.htm', '.html', '.txt', '.xml']:
-                 #     content = content.decode('utf-8', errors='replace')
+                     raise FileNotFoundError(f"Document file not found in tar: {filename}")

                  document_path = f"{self.path}::{actual_filename}"

@@ -222,15 +239,6 @@ class Submission:
              with document_path.open('rb') as f:
                  content = f.read()

-             # Decompress if needed based on filename extension
-             if is_compressed == 'gzip':
-                 content = gzip.decompress(content)
-             elif is_compressed == 'zstd':
-                 content = zstd.ZstdDecompressor().decompress(content)
-
-             # Decode text files
-             # if extension in ['.htm', '.html', '.txt', '.xml']:
-             #     content = content.decode('utf-8', errors='replace')

          return Document(
              type=doc['type'],
@@ -260,20 +268,24 @@ class Submission:
      def parse_xbrl(self):
          if self._xbrl:
              return
-
+
+         if not self._xbrl_bool:
+             print(f"Submission: {self.accession} has no xbrl")
+             return
+
          for idx, doc in enumerate(self.metadata.content['documents']):
              if doc['type'] in ['EX-100.INS','EX-101.INS']:
                  document = self._load_document_by_index(idx)
                  self._xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
-                 return
-
+                 return
+
              if doc['filename'].endswith('_htm.xml'):
                  document = self._load_document_by_index(idx)
                  self._xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
                  return

      @property
-     def xbrl(self):
+     def xbrl(self):
          if self._xbrl is None:
              self.parse_xbrl()
          return self._xbrl
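
parse_xbrl() now consults the precomputed _xbrl_bool flag before scanning, so filings without XBRL exit early with a printed notice instead of iterating every document. Usage is unchanged; a sketch with a hypothetical local filing:

    from pathlib import Path
    from datamule import Submission

    sub = Submission(path=Path('filings/000000000025000000.tar'))  # hypothetical
    facts = sub.xbrl  # parses lazily on first access; stays None when the filing has no XBRL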
@@ -353,20 +365,61 @@ class Submission:

      @property
      def fundamentals(self):
-         """Get all fundamental data"""
-         return self.parse_fundamentals(categories=None)
+         """Access fundamentals via attributes: sub.fundamentals.incomeStatement"""
+         if not hasattr(self, '_fundamentals_accessor'):
+             self._fundamentals_accessor = FundamentalsAccessor(self)
+         return self._fundamentals_accessor
+
+     @property
+     def tar(self):
+         return self._tar_submission().getvalue()
+
+     def set_tar_compression(self,compression_type='zstd',level=3):
+         self._tar_compression_type = compression_type
+         self._tar_compression_level = level
+
+     def _tar_submission(self):
+         if self._tar is not None:
+             return self._tar
+         else:
+             documents_obj_list = self._get_documents_obj_list()
+             self._tar = tar_submission(
+                 documents_obj_list=documents_obj_list,
+                 metadata=self.metadata.content,
+                 compression_type=self._tar_compression_type,
+                 level=self._tar_compression_level
+             )
+             return self._tar
+
+     @property
+     def accession_year_2d(self):
+         return self._get_accession_year_2d()
+
+     def _get_accession_year_2d(self):
+         if self._accession_year_2d is not None:
+             return self._accession_year_2d
+         self._accession_year_2d = format_accession(self.accession,'dash').split('-')[1]
+         return self._accession_year_2d
+
+     @property
+     def documents(self):
+         return self._get_documents()
+
+     def _get_documents(self):
+         if self._documents is not None:
+             return self._documents
+
+         self._documents = self.metadata.content['documents']
+         return self._documents

-     def __getattr__(self, name):
-         # Check if it's a fundamentals property request
-         if name.endswith('_fundamentals'):
-             category = name.replace('_fundamentals', '')
-             return self.parse_fundamentals(categories=[category])
+     def _get_documents_obj_list(self):
+         """Get all documents as Document objects"""
+         if hasattr(self, 'documents_obj_list'):
+             return self.documents_obj_list

-         # For any other unknown attribute, try it as a fundamentals category
-         # Let parse_fundamentals handle whether it's valid or not
-         result = self.parse_fundamentals(categories=[name])
-         if result is not None:
-             return result
+         # Generate documents_obj_list for batch tar and path cases
+         documents_obj_list = []
+         for idx in range(len(self.metadata.content['documents'])):
+             documents_obj_list.append(self._load_document_by_index(idx))

-         # Only raise AttributeError if parse_fundamentals returns None/empty
-         raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+         return documents_obj_list
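
The catch-all Submission.__getattr__ is replaced by the dedicated FundamentalsAccessor, and Submission.documents now returns the metadata dicts rather than Document objects. A usage sketch of the new surface (attribute names such as incomeStatement depend on what parse_fundamentals returns for the filing; the path is hypothetical):

    from pathlib import Path
    from datamule import Submission

    sub = Submission(path=Path('filings/000000000025000000.tar'))  # hypothetical

    stmt = sub.fundamentals.incomeStatement          # per-category, cached by the accessor
    same = sub.fundamentals['incomeStatement']       # dict-style access also works

    meta_docs = sub.documents                        # list of metadata dicts (was Document objects)
    sub.set_tar_compression('zstd', level=3)         # only effective before the first .tar access (result is cached)
    tar_bytes = sub.tar                              # whole submission as an in-memory tar archive (bytes)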
datamule/submission/tar_submission.py ADDED
@@ -0,0 +1,79 @@
+ import zstandard as zstd
+ from secsgml.utils import calculate_documents_locations_in_tar
+ import tarfile
+ import io
+ import json
+
+ # Note: we don't actually need accession at this level. TODO
+
+ def compress_content(content, compression_type, level):
+     if compression_type == 'zstd':
+         # Create compressor with specified level
+         compressor = zstd.ZstdCompressor(level=level)
+
+         # Handle string content
+         # This should never be called
+         if isinstance(content, str):
+             content_bytes = content.encode('utf-8')
+         else:
+             content_bytes = content
+
+         # Compress and return
+         return compressor.compress(content_bytes)
+
+     # Return uncompressed if not zstd
+     return content
+
+ def compress_content_list(document_tuple_list, compression_type, level):
+     if compression_type is None:
+         return document_tuple_list
+
+     if level is None:
+         level = 3
+
+     # Create new list to avoid modifying original
+     compressed_list = []
+     for document_tuple in document_tuple_list:
+         content = document_tuple[0]
+         accession = document_tuple[1]
+         compressed_content = compress_content(content, compression_type, level)
+         compressed_list.append((compressed_content, accession))
+
+     return compressed_list
+
+ def tar_content_list(metadata, document_tuple_list_compressed):
+     # Update metadata with compressed sizes
+     for i, (content, accession) in enumerate(document_tuple_list_compressed):
+         metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+     metadata = calculate_documents_locations_in_tar(metadata)
+
+     tar_buffer = io.BytesIO()
+     with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
+         # Add metadata first
+         metadata_json = json.dumps(metadata).encode('utf-8')
+         tarinfo = tarfile.TarInfo(f'metadata.json')
+         tarinfo.size = len(metadata_json)
+         tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+         # Add each content
+         for i, (content, accession) in enumerate(document_tuple_list_compressed):
+             doc = metadata['documents'][i]
+             filename = doc.get('filename', doc['sequence'] + '.txt')
+
+             tarinfo = tarfile.TarInfo(name=filename)
+             tarinfo.size = len(content)
+             tar.addfile(tarinfo, io.BytesIO(content))
+
+     # Return the tar buffer
+     tar_buffer.seek(0)  # Reset buffer position to beginning
+     return tar_buffer
+
+ def tar_submission(metadata, documents_obj_list, compression_type=None, level=None):
+     """Takes a list of documents, compresses them, then tars them."""
+     document_tuple_list = [(doc.content, doc.accession) for doc in documents_obj_list]
+     document_tuple_list_compressed = compress_content_list(document_tuple_list, # Fixed: correct parameter name
+                                                            compression_type=compression_type,
+                                                            level=level)
+
+     return tar_content_list(metadata, document_tuple_list_compressed)
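
tar_content_list writes metadata.json first and then each document, with per-document zstd compression applied before the compressed sizes are recorded in the metadata. A round-trip sketch for reading such an archive back (assumes the documents were written with compression_type='zstd'; skip the decompress step for uncompressed archives):

    import io
    import json
    import tarfile
    import zstandard as zstd

    tar_bytes = sub.tar  # from the Submission.tar property above
    with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode='r') as tar:
        metadata = json.load(tar.extractfile('metadata.json'))
        for doc in metadata['documents']:
            name = doc.get('filename', doc['sequence'] + '.txt')
            raw = tar.extractfile(name).read()
            content = zstd.ZstdDecompressor().decompress(raw)  # per-document zstd frame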
datamule/tables/__init__.py ADDED (empty file)
{datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 2.3.5
+ Version: 2.3.7
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
{datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/RECORD RENAMED
@@ -1,16 +1,13 @@
- datamule/__init__.py,sha256=gsWTW0emwGtM-KVtwe2OICVmW7ImvLvP0SORULTPe-Y,1220
+ datamule/__init__.py,sha256=fy8h9IQQqSqOvRXJ6Q7Q-8nWwdnw2THP6puqfGkIB4k,1278
  datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
  datamule/datasets.py,sha256=1A9PPPyLIQ51evXLSsiKmVxNmjbO6c2FGszrairREjc,2058
  datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
  datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
- datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
- datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
- datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
- datamule/submission.py,sha256=phHmi9ScjWHtVLjEoEdAO7RieUSKN5gPr0onfg5R8wE,16139
  datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/book/book.py,sha256=Vw33JHhmulNDWRN2AQpUQrf8wgVqqUYg5QJgbKhBNak,773
- datamule/book/s3transfer.py,sha256=4Zpw5daAH05u1dppv2ARXG_VSBIdsHnlEWC9xZgBfZM,12590
+ datamule/book/s3transfer.py,sha256=arftLhYThLSGvmBSNnU2rNpkqiyvwAL32OVAKP4HOAQ,12596
+ datamule/cloud/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
@@ -18,22 +15,14 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
  datamule/datamule/downloader.py,sha256=Ss9mz0Jf5UAd-CZJ6oO96o9hN04xMQIF3-e1wahokdM,18581
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/document/document.py,sha256=ooTbMpB_cBeONhtic8bNE4ISWltQIxQz4LLuXcGe8xc,23015
- datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/document/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
- datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
- datamule/document/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
- datamule/document/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
- datamule/document/tables/tables_npx.py,sha256=tZDBAonAQWLsgecVK_OwIgNcUJhuV5L2gkTSNbXAgNE,6652
- datamule/document/tables/tables_ownership.py,sha256=pRoFFRGLWp8gKAAvvUbVRxIU2xDFAQhwi9bgwddsT8A,11185
- datamule/document/tables/tables_proxyvotingrecord.py,sha256=S_Th294-KWRL-QVXkexNWIksSaFePZGSVq6EU8iiK0o,896
- datamule/document/tables/tables_sbsef.py,sha256=X6VKVnAdWxn2TgRmaAd1WWlxPhcLPQ-53s0qDokkPI0,635
- datamule/document/tables/tables_sdr.py,sha256=BwHRJvtijiYvNJ2lIc_30kct6VEmLimIzX28JjZBBqo,4924
- datamule/document/tables/utils.py,sha256=2-X_1NsiWj_XsD9djxCXwTeIVlg-ip78gG11xACJiDs,738
+ datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/mapping_dicts/html_mapping_dicts.py,sha256=pba3utMr2KldPeEGnMRkHyVw7D2WHSDpg_5u36pHMII,5411
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
  datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
+ datamule/portfolio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamule/portfolio/portfolio.py,sha256=YPIvS4KKuEtm8A1XvNqDF39f4LJHhAFWmtpJzjbGDhY,11680
+ datamule/portfolio/portfolio_compression_utils_legacy.py,sha256=1nlbz7JfBDrI0pwTyFiBF856xqGXvQRYBulLUpk7G1A,12695
  datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/sec/utils.py,sha256=96bavyG2Kq1t8L1YA2vwYnAHKIKdRSoVXxBO5QH1HWo,2196
  datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,8 +39,23 @@ datamule/sec/xbrl/filter_xbrl.py,sha256=QiSfm7tsJVLIw2PFqGh8D01qsRe_ZB-mbFhr6KcB
  datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H5N2tbvKUzM,3307
  datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
  datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
- datamule/sentiment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,18048
+ datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
+ datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamule/submission/submission.py,sha256=4UtdViw-h_4Rqt09SFe8-WWdLqaD55T3vqTUVRB0CsE,17058
+ datamule/submission/tar_submission.py,sha256=lkm1neVLW2_-G26VylL6Rzx98Cavvml0Qd2wlJHD0bw,3075
+ datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
+ datamule/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
+ datamule/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
+ datamule/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
+ datamule/tables/tables_npx.py,sha256=tZDBAonAQWLsgecVK_OwIgNcUJhuV5L2gkTSNbXAgNE,6652
+ datamule/tables/tables_ownership.py,sha256=pRoFFRGLWp8gKAAvvUbVRxIU2xDFAQhwi9bgwddsT8A,11185
+ datamule/tables/tables_proxyvotingrecord.py,sha256=S_Th294-KWRL-QVXkexNWIksSaFePZGSVq6EU8iiK0o,896
+ datamule/tables/tables_sbsef.py,sha256=X6VKVnAdWxn2TgRmaAd1WWlxPhcLPQ-53s0qDokkPI0,635
+ datamule/tables/tables_sdr.py,sha256=BwHRJvtijiYvNJ2lIc_30kct6VEmLimIzX28JjZBBqo,4924
+ datamule/tables/utils.py,sha256=2-X_1NsiWj_XsD9djxCXwTeIVlg-ip78gG11xACJiDs,738
  datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/tags/config.py,sha256=rxawvOBDT2v72Aw-VkmnUOLsKSAIrZBrjz_E0hPU7MY,1677
  datamule/tags/dictionaries.py,sha256=1v2OoN1KnM3HbFHxATxe7LhVRoXe64ecRRgA3oak210,4587
@@ -61,7 +65,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
  datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
- datamule-2.3.5.dist-info/METADATA,sha256=8KXiAyLcn5aVKF38N4H7fWYnooUPPkIujAfqoOr658k,609
- datamule-2.3.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- datamule-2.3.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
- datamule-2.3.5.dist-info/RECORD,,
+ datamule-2.3.7.dist-info/METADATA,sha256=1Igs40zdVpr6XPH4s2ToG5EIyAsI1lpdA1yiuGUjsx4,609
+ datamule-2.3.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ datamule-2.3.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-2.3.7.dist-info/RECORD,,
Remaining files in the list (the renamed tables modules, new __init__.py markers, WHEEL, and top_level.txt): files without changes.
File without changes