datamule 1.5.2__tar.gz → 1.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {datamule-1.5.2 → datamule-1.5.4}/PKG-INFO +1 -1
  2. {datamule-1.5.2 → datamule-1.5.4}/datamule/portfolio.py +8 -4
  3. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/submissions/downloader.py +3 -2
  4. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/submissions/monitor.py +42 -22
  5. {datamule-1.5.2 → datamule-1.5.4}/datamule/seclibrary/downloader.py +97 -8
  6. {datamule-1.5.2 → datamule-1.5.4}/datamule/submission.py +123 -45
  7. {datamule-1.5.2 → datamule-1.5.4}/datamule.egg-info/PKG-INFO +1 -1
  8. {datamule-1.5.2 → datamule-1.5.4}/setup.py +1 -1
  9. {datamule-1.5.2 → datamule-1.5.4}/datamule/__init__.py +0 -0
  10. {datamule-1.5.2 → datamule-1.5.4}/datamule/config.py +0 -0
  11. {datamule-1.5.2 → datamule-1.5.4}/datamule/data/listed_filer_metadata.csv +0 -0
  12. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/__init__.py +0 -0
  13. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/document.py +0 -0
  14. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/__init__.py +0 -0
  15. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/atsn.py +0 -0
  16. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/cfportal.py +0 -0
  17. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/d.py +0 -0
  18. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/ex102_abs.py +0 -0
  19. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/ex99a_sdr.py +0 -0
  20. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/ex99c_sdr.py +0 -0
  21. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/ex99g_sdr.py +0 -0
  22. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/ex99i_sdr.py +0 -0
  23. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/information_table.py +0 -0
  24. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/nmfp.py +0 -0
  25. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/npx.py +0 -0
  26. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/onefourtyfour.py +0 -0
  27. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/ownership.py +0 -0
  28. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/proxy_voting_record.py +0 -0
  29. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/sbs.py +0 -0
  30. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/sbsef.py +0 -0
  31. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/schedule13.py +0 -0
  32. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/sdr.py +0 -0
  33. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/submission_metadata.py +0 -0
  34. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/ta.py +0 -0
  35. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/thirteenfhr.py +0 -0
  36. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/twentyfivense.py +0 -0
  37. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  38. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/processing.py +0 -0
  39. {datamule-1.5.2 → datamule-1.5.4}/datamule/document/table.py +0 -0
  40. {datamule-1.5.2 → datamule-1.5.4}/datamule/helper.py +0 -0
  41. {datamule-1.5.2 → datamule-1.5.4}/datamule/index.py +0 -0
  42. {datamule-1.5.2 → datamule-1.5.4}/datamule/mapping_dicts/__init__.py +0 -0
  43. {datamule-1.5.2 → datamule-1.5.4}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  44. {datamule-1.5.2 → datamule-1.5.4}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  45. {datamule-1.5.2 → datamule-1.5.4}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  46. {datamule-1.5.2 → datamule-1.5.4}/datamule/package_updater.py +0 -0
  47. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/__init__.py +0 -0
  48. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/infrastructure/__init__.py +0 -0
  49. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  50. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/submissions/__init__.py +0 -0
  51. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/submissions/eftsquery.py +0 -0
  52. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/submissions/streamer.py +0 -0
  53. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/submissions/textsearch.py +0 -0
  54. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/utils.py +0 -0
  55. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/xbrl/__init__.py +0 -0
  56. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  57. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  58. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  59. {datamule-1.5.2 → datamule-1.5.4}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  60. {datamule-1.5.2 → datamule-1.5.4}/datamule/seclibrary/__init__.py +0 -0
  61. {datamule-1.5.2 → datamule-1.5.4}/datamule/seclibrary/bq.py +0 -0
  62. {datamule-1.5.2 → datamule-1.5.4}/datamule/seclibrary/query.py +0 -0
  63. {datamule-1.5.2 → datamule-1.5.4}/datamule/sheet.py +0 -0
  64. {datamule-1.5.2 → datamule-1.5.4}/datamule.egg-info/SOURCES.txt +0 -0
  65. {datamule-1.5.2 → datamule-1.5.4}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-1.5.2 → datamule-1.5.4}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-1.5.2 → datamule-1.5.4}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-1.5.2 → datamule-1.5.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.5.2
+ Version: 1.5.4
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -34,7 +34,6 @@ class Portfolio:

  def _load_submissions(self):
  folders = [f for f in self.path.iterdir() if f.is_dir() or f.suffix=='.tar']
- print(folders)
  print(f"Loading {len(folders)} submissions")

  def load_submission(folder):
@@ -126,7 +125,8 @@ class Portfolio:
  # First query, just set the accession numbers
  self.accession_numbers = new_accession_numbers

- def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],requests_per_second=5, **kwargs):
+ def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
+ requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True, **kwargs):
  if provider is None:
  config = Config()
  provider = config.get_default_source()
@@ -143,7 +143,9 @@ class Portfolio:
  submission_type=submission_type,
  filing_date=filing_date,
  accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
- keep_document_types=document_type
+ keep_document_types=document_type,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata,
  )
  else:
  sec_download(
@@ -153,7 +155,9 @@ class Portfolio:
  filing_date=filing_date,
  requests_per_second=requests_per_second,
  accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
- keep_document_types=document_type
+ keep_document_types=document_type,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata,
  )

  self.submissions_loaded = False
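Both download paths now accept the two new flags, so a Portfolio call can control document filtering end to end (the same flags are forwarded to the SEC downloader in the next file). A minimal sketch, assuming Portfolio still takes a working-directory path as in earlier releases; the ticker and form type below are only illustrative:

    from datamule.portfolio import Portfolio

    portfolio = Portfolio('apple_filings')  # hypothetical output directory

    # document_type filters documents inside each submission; the new flags
    # control whether filtered documents keep their metadata entries and
    # whether metadata is standardized before each tar is written.
    portfolio.download_submissions(
        ticker='AAPL',
        submission_type='10-K',
        document_type=['10-K'],
        keep_filtered_metadata=False,
        standardize_metadata=True,
    )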
@@ -5,7 +5,7 @@ from tqdm import tqdm

  def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
  requests_per_second=5, output_dir="filings", accession_numbers=None,
- quiet=False, keep_document_types=[]):
+ quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
  # Make sure output directory exists
  os.makedirs(output_dir, exist_ok=True)

@@ -14,7 +14,8 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
  # Create a wrapper for the download_callback that includes the output_dir
  async def callback_wrapper(hit, content, cik, accno, url):
  output_path = os.path.join(output_dir, accno.replace('-','') + '.tar')
- write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types)
+ write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types,keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata)
  pbar.update(1)

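The standalone SEC downloader threads the same flags through to write_sgml_file_to_tar. A hedged sketch of a direct call, assuming the module-level download function is imported from the file path listed above; the CIK and exhibit type are illustrative:

    from datamule.sec.submissions.downloader import download

    download(
        cik='1318605',                    # example CIK, passed as in prior versions
        submission_type='8-K',
        output_dir='filings',
        keep_document_types=['EX-99.1'],  # keep only this exhibit per filing
        keep_filtered_metadata=False,     # drop metadata entries for filtered documents
        standardize_metadata=True,
    )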
@@ -77,12 +77,19 @@ class Monitor():
  )

  async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
- polling_interval=1000, quiet=True, start_date=None,
- validation_interval=60000):
+ polling_interval=1000, quiet=True, start_date=None,
+ validation_interval=60000):
  """
  Async implementation of monitor_submissions.
+ Either polling_interval or validation_interval (or both) must be specified.
+ If polling_interval is None, only EFTS validation will be performed.
+ If validation_interval is None, only RSS polling will be performed.
  """

+ # Validate that at least one interval is specified
+ if polling_interval is None and validation_interval is None:
+ raise ValueError("At least one of polling_interval or validation_interval must be specified")
+
  # Backfill if start_date is provided
  if start_date is not None:
  today_date = datetime.now().date().strftime('%Y-%m-%d')
@@ -100,24 +107,33 @@ class Monitor():
  if new_hits and data_callback:
  data_callback(new_hits)

- last_polling_time = time.time()
- last_validation_time = last_polling_time
- current_time = last_polling_time
-
+ # Initialize timing variables
+ current_time = time.time()
+ last_polling_time = current_time
+ last_validation_time = current_time
+
+ # Determine which operations to perform
+ do_polling = polling_interval is not None
+ do_validation = validation_interval is not None
+
  while True:
- # RSS polling
- if not quiet:
- print(f"Polling RSS feed")
- results = await poll_rss(self.ratelimiters['sec.gov'])
- new_results = self._filter_new_accessions(results)
- if new_results:
+ current_time = time.time()
+
+ # RSS polling (if enabled)
+ if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
  if not quiet:
- print(f"Found {len(new_results)} new submissions via RSS")
- if data_callback:
- data_callback(new_results)
+ print(f"Polling RSS feed")
+ results = await poll_rss(self.ratelimiters['sec.gov'])
+ new_results = self._filter_new_accessions(results)
+ if new_results:
+ if not quiet:
+ print(f"Found {len(new_results)} new submissions via RSS")
+ if data_callback:
+ data_callback(new_results)
+ last_polling_time = current_time

- # EFTS validation
- if validation_interval and (current_time - last_validation_time) >= validation_interval/1000:
+ # EFTS validation (if enabled)
+ if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
  # Get submissions from the last 24 hours for validation
  today_date = datetime.now().strftime('%Y-%m-%d')
  if not quiet:
@@ -134,19 +150,23 @@ class Monitor():
  print(f"Found {len(new_hits)} new submissions via EFTS validation")
  if data_callback:
  data_callback(new_hits)
- last_polling_time = time.time()
  last_validation_time = current_time

  # Interval callback
  if interval_callback:
  interval_callback()

- next_poll_time = last_polling_time + (polling_interval / 1000)
+ # Calculate next wake-up time
+ next_times = []
+ if do_polling:
+ next_times.append(last_polling_time + (polling_interval / 1000))
+ if do_validation:
+ next_times.append(last_validation_time + (validation_interval / 1000))
+
+ next_wake_time = min(next_times)
  current_time = time.time()
- time_to_sleep = max(0, next_poll_time - current_time)
+ time_to_sleep = max(0, next_wake_time - current_time)
  await asyncio.sleep(time_to_sleep)
- last_polling_time = next_poll_time
-

  def monitor_submissions(self, data_callback=None, interval_callback=None,
  polling_interval=1000, quiet=True, start_date=None,
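Because either interval can now be None (but not both), the monitor can run in RSS-only or EFTS-validation-only mode. A sketch under the assumption that Monitor() needs no constructor arguments and that the public monitor_submissions signature mirrors the async one above; intervals are in milliseconds, as the /1000 conversions show:

    from datamule.sec.submissions.monitor import Monitor

    def on_new_submissions(hits):
        # hits is whatever list the monitor passes to data_callback
        for hit in hits:
            print(hit)

    monitor = Monitor()

    # Validation-only mode: RSS polling disabled, EFTS check every 5 minutes.
    # Passing None for both intervals now raises ValueError.
    monitor.monitor_submissions(
        data_callback=on_new_submissions,
        polling_interval=None,
        validation_interval=300_000,
        quiet=False,
    )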
@@ -74,7 +74,7 @@ class Downloader:
  print(f"Failed to log error to {error_file}: {str(e)}")

  class FileProcessor:
- def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[]):
+ def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[], keep_filtered_metadata=False,standardize_metadata=True):
  self.processing_queue = Queue(maxsize=queue_size)
  self.should_stop = False
  self.processing_workers = []
@@ -84,6 +84,8 @@ class Downloader:
  self.pbar = pbar
  self.downloader = downloader
  self.keep_document_types = keep_document_types
+ self.keep_filtered_metadata = keep_filtered_metadata
+ self.standardize_metadata = standardize_metadata

  def start_processing_workers(self):
  for _ in range(self.max_workers):
@@ -95,7 +97,8 @@ class Downloader:
  def _process_file(self, item):
  filename, content = item
  output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
- write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types)
+ write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
+
  self.pbar.update(1)

  def _processing_worker(self):
@@ -204,11 +207,12 @@ class Downloader:
  except Exception as e:
  self._log_error(output_dir, filename, str(e))

- async def process_batch(self, urls, output_dir, keep_document_types=[]):
+ async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
  os.makedirs(output_dir, exist_ok=True)

  with tqdm(total=len(urls), desc="Processing files") as pbar:
- processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
+ processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types,
+ keep_filtered_metadata=keep_filtered_metadata,standardize_metadata=standardize_metadata)
  processor.start_processing_workers()

  semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
@@ -231,7 +235,7 @@ class Downloader:
  processor.stop_workers()
  decompression_pool.shutdown()

- def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
+ def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
  """
  Query SEC filings and download/process them.

@@ -242,6 +246,7 @@ class Downloader:
  - output_dir: Directory to save downloaded files
  - accession_numbers: List of specific accession numbers to download
  - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+ - keep_filtered_metadata: Whether to keep metadata for filtered documents
  """
  if self.api_key is None:
  raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
@@ -279,7 +284,7 @@ class Downloader:
  start_time = time.time()

  # Process the batch asynchronously
- asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))
+ asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))

  # Calculate and display performance metrics
  elapsed_time = time.time() - start_time
@@ -292,7 +297,65 @@ class Downloader:
  self.loop.call_soon_threadsafe(self.loop.stop)


- def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
+
+ def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+ """
+ Download and process SEC filings using specific filenames.
+
+ Parameters:
+ - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+ - output_dir: Directory to save downloaded files
+ - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+ - keep_filtered_metadata: Whether to keep metadata for filtered documents
+ - standardize_metadata: Whether to standardize metadata format
+ """
+ if self.api_key is None:
+ raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+ if not filenames:
+ raise ValueError("No filenames provided")
+
+ if not isinstance(filenames, (list, tuple)):
+ filenames = [filenames]
+
+ # Validate filenames format
+ for filename in filenames:
+ if not isinstance(filename, str):
+ raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
+ if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
+ raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
+
+ # Generate URLs directly from filenames
+ print(f"Generating URLs for {len(filenames)} files...")
+ urls = []
+ for filename in filenames:
+ url = f"{self.BASE_URL}{filename}"
+ urls.append(url)
+
+ # Remove duplicates while preserving order
+ seen = set()
+ urls = [url for url in urls if not (url in seen or seen.add(url))]
+
+ print(f"Downloading {len(urls)} files...")
+
+ # Process the batch asynchronously using existing infrastructure
+ start_time = time.time()
+
+ asyncio.run(self.process_batch(
+ urls,
+ output_dir,
+ keep_document_types=keep_document_types,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata
+ ))
+
+ # Calculate and display performance metrics
+ elapsed_time = time.time() - start_time
+ print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+ print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+
+ def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
  """
  Query SEC filings and download/process them.

@@ -304,6 +367,7 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
  - output_dir: Directory to save downloaded files
  - accession_numbers: List of specific accession numbers to download
  - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+ - keep_filtered_metadata: Whether to keep metadata for filtered documents
  """
  if accession_numbers:
  accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
@@ -317,5 +381,30 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
  filing_date=filing_date,
  output_dir=output_dir,
  accession_numbers=accession_numbers,
- keep_document_types=keep_document_types
+ keep_document_types=keep_document_types,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata
+ )
+
+ def download_files_using_filename(filenames, api_key=None, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+ """
+ Download and process SEC filings using specific filenames.
+
+ Parameters:
+ - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+ - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+ - output_dir: Directory to save downloaded files
+ - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+ - keep_filtered_metadata: Whether to keep metadata for filtered documents
+ - standardize_metadata: Whether to standardize metadata format
+ """
+ downloader = Downloader(api_key=api_key)
+ downloader.QUEUE_SIZE = 1
+ downloader.MAX_CONCURRENT_DOWNLOADS = 1
+ downloader.download_files_using_filename(
+ filenames=filenames,
+ output_dir=output_dir,
+ keep_document_types=keep_document_types,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata
+ )
  )
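The new download_files_using_filename helpers skip the query step and fetch named .sgml / .sgml.zst archives directly; the module-level wrapper also pins QUEUE_SIZE and MAX_CONCURRENT_DOWNLOADS to 1. A short sketch using the module-level function, assuming DATAMULE_API_KEY is set (the filenames are the placeholders from the docstring):

    from datamule.seclibrary.downloader import download_files_using_filename

    download_files_using_filename(
        filenames=[
            '000091205797006494.sgml',
            '000100704297000007.sgml.zst',
        ],
        output_dir='downloads',
        keep_document_types=['10-K'],   # optional document-type filter
        keep_filtered_metadata=False,
        standardize_metadata=True,
    )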
@@ -2,11 +2,80 @@ from pathlib import Path
  import json
  from .document.document import Document
  from secsgml import parse_sgml_content_into_memory
+ from secsgml.utils import bytes_to_str
+ from secsgml.parse_sgml import transform_metadata_string
  import tarfile
  import shutil
  import zstandard as zstd
- from io import BytesIO
  import gzip
+ import io
+ import copy
+
+
+ def calculate_documents_locations_in_tar(metadata, documents):
+ # Step 1: Add placeholder byte positions to get accurate size (10-digit padded)
+ placeholder_metadata = copy.deepcopy(metadata)
+
+ for file_num in range(len(documents)):
+ if 'documents' in placeholder_metadata:
+ placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999" # 10 digits
+ placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999" # 10 digits
+
+ # Step 2: Calculate size with placeholders
+ placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
+ placeholder_json = json.dumps(placeholder_str).encode('utf-8')
+ metadata_size = len(placeholder_json)
+
+ # Step 3: Now calculate actual positions using this size
+ current_pos = 512 + metadata_size
+ current_pos += (512 - (current_pos % 512)) % 512
+
+ # Step 4: Calculate real positions and update original metadata (10-digit padded)
+ for file_num, content in enumerate(documents):
+ start_byte = current_pos + 512
+ end_byte = start_byte + len(content)
+
+ if 'documents' in metadata:
+ metadata['documents'][file_num]['secsgml_start_byte'] = f"{start_byte:010d}" # 10-digit padding
+ metadata['documents'][file_num]['secsgml_end_byte'] = f"{end_byte:010d}" # 10-digit padding
+
+
+ file_total_size = 512 + len(content)
+ padded_size = file_total_size + (512 - (file_total_size % 512)) % 512
+ current_pos += padded_size
+
+ return metadata
+
+
+ def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
+ # Write tar directly to disk
+ with tarfile.open(output_path, 'w') as tar:
+
+ # calculate document locations in tar
+ metadata = calculate_documents_locations_in_tar(metadata, documents)
+
+ # serialize metadata
+ metadata_str = bytes_to_str(metadata,lower=False)
+ metadata_json = json.dumps(metadata_str).encode('utf-8')
+ # save metadata
+ tarinfo = tarfile.TarInfo(name='metadata.json')
+ tarinfo.size = len(metadata_json)
+ tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+ for file_num, content in enumerate(documents, 0):
+ if standardize_metadata:
+ document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
+
+ compression = compression_list[file_num]
+ if compression == 'gzip':
+ document_name = f'{document_name}.gz'
+ elif compression == 'zstd':
+ document_name = f'{document_name}.zst'
+
+
+ tarinfo = tarfile.TarInfo(name=f'{document_name}')
+ tarinfo.size = len(content)
+ tar.addfile(tarinfo, io.BytesIO(content))

  class Submission:
  def __init__(self, path=None,sgml_content=None,keep_document_types=None):
@@ -18,6 +87,10 @@ class Submission:
  if sgml_content is not None:
  self.path = None
  metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+
+ # standardize metadata
+ metadata = transform_metadata_string(metadata)
+
  self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
  # code dupe
  self.accession = self.metadata.content['accession-number']
@@ -55,6 +128,9 @@ class Submission:
  metadata_path = self.path / 'metadata.json'
  with metadata_path.open('r') as f:
  metadata = json.load(f)
+
+ # standardize metadata
+ metadata = transform_metadata_string(metadata)
  self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
  self.accession = self.metadata.content['accession-number']
  self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
@@ -68,51 +144,34 @@ class Submission:
  if compression is not None and compression not in ['gzip', 'zstd']:
  raise ValueError("compression must be 'gzip' or 'zstd'")

+ # check if we're loading from a dir or a tar file
+ is_dir_not_tar = True
+ if self.path.suffix == '.tar':
+ is_dir_not_tar = False
+ elif not self.path.is_dir():
+ raise ValueError("Path must be a directory to compress")
  # Create tar file (replace directory with .tar file)
  tar_path = self.path.with_suffix('.tar')
+
+ # load all files in the directory or tar file
+ documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]

- with tarfile.open(tar_path, 'w') as tar:
- # Add metadata.json first
- metadata_path = self.path / 'metadata.json'
- if metadata_path.exists():
- tar.add(metadata_path, arcname='metadata.json')
-
- # Add documents in order
- for doc in self.metadata.content['documents']:
- filename = doc.get('filename')
- if filename is None:
- filename = doc['sequence'] + '.txt'
-
- file_path = self.path / filename
- if file_path.exists():
- file_size = file_path.stat().st_size

-
- # Compress if compression specified and over threshold
- if compression is not None and file_size >= threshold:
- content = file_path.read_bytes()
-
- if compression == 'gzip':
- compressed_content = gzip.compress(content, compresslevel=level or 6)
- compressed_filename = filename + '.gz'
- else: # zstd
- cctx = zstd.ZstdCompressor(level=level or 3)
- compressed_content = cctx.compress(content)
- compressed_filename = filename + '.zst'
-
- # Add compressed file to tar
- tarinfo = tarfile.TarInfo(name=compressed_filename)
- tarinfo.size = len(compressed_content)
- tar.addfile(tarinfo, BytesIO(compressed_content))
- else:
- # Add uncompressed file
- tar.add(file_path, arcname=filename)
+ # we should compress everything here first.
+ compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
+ documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
+ len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
+ len(doc) >= threshold else doc for doc in documents]

+ metadata = self.metadata.content.copy()
+ write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
+
  # Delete original folder
- shutil.rmtree(self.path)
-
- # Update path to point to new tar file
- self.path = tar_path
+ if is_dir_not_tar:
+ shutil.rmtree(self.path)
+ # otherwise, we already replaced the tar file
+ # Update path to point to new tar file
+ self.path = tar_path

  def decompress(self):
  if self.path is None:
@@ -129,17 +188,36 @@ class Submission:
  if member.isfile():
  content = tar.extractfile(member).read()

- # Decompress if gzipped
+ # Decompress based on file extension
  if member.name.endswith('.gz'):
  content = gzip.decompress(content)
  output_path = output_dir / member.name[:-3] # Remove .gz extension
+ elif member.name.endswith('.zst'):
+ dctx = zstd.ZstdDecompressor()
+ content = dctx.decompress(content)
+ output_path = output_dir / member.name[:-4] # Remove .zst extension
  else:
  output_path = output_dir / member.name

- # Write to output directory
- output_path.parent.mkdir(parents=True, exist_ok=True)
- with output_path.open('wb') as f:
- f.write(content)
+ # check if it is metadata.json
+ if output_path.name == 'metadata.json':
+ # load as json
+ metadata = json.loads(content.decode('utf-8'))
+ # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
+ for doc in metadata['documents']:
+ if 'secsgml_start_byte' in doc:
+ del doc['secsgml_start_byte']
+
+ if 'secsgml_end_byte' in doc:
+ del doc['secsgml_end_byte']
+
+ with output_path.open('w', encoding='utf-8') as f:
+ json.dump(metadata, f)
+ else:
+ # Write to output directory
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with output_path.open('wb') as f:
+ f.write(content)

  # delete original file
  self.path.unlink()
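compress() now also accepts a submission already packed as a .tar, compresses each document in memory once it meets the size threshold, and records secsgml_start_byte / secsgml_end_byte offsets (10-digit, zero-padded) in metadata.json; decompress() strips those keys and handles both .gz and .zst members. A round-trip sketch, assuming compress() exposes compression, level, and threshold keyword arguments as used in its body (exact defaults are not shown in this diff) and using a placeholder path:

    from datamule.submission import Submission

    sub = Submission(path='filings/000123456789012345')  # placeholder submission directory

    # Repack as a single tar; documents of 100 KB or more are zstd-compressed
    # and their byte offsets are written into metadata.json.
    sub.compress(compression='zstd', level=3, threshold=100_000)

    # Unpack back to a directory; .gz/.zst members are decompressed and the
    # secsgml_*_byte keys are removed from metadata.json on the way out.
    sub.decompress()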
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.5.2
+ Version: 1.5.4
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
  setup(
  name="datamule",
  author="John Friedman",
- version="1.5.2",
+ version="1.5.4",
  description="Work with SEC submissions at scale.",
  packages=find_packages(include=['datamule', 'datamule.*']),
  url="https://github.com/john-friedman/datamule-python",