datamule 1.5.2__tar.gz → 1.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {datamule-1.5.2 → datamule-1.5.3}/PKG-INFO +1 -1
  2. {datamule-1.5.2 → datamule-1.5.3}/datamule/portfolio.py +8 -4
  3. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/downloader.py +3 -2
  4. {datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/downloader.py +15 -8
  5. {datamule-1.5.2 → datamule-1.5.3}/datamule/submission.py +115 -45
  6. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/PKG-INFO +1 -1
  7. {datamule-1.5.2 → datamule-1.5.3}/setup.py +1 -1
  8. {datamule-1.5.2 → datamule-1.5.3}/datamule/__init__.py +0 -0
  9. {datamule-1.5.2 → datamule-1.5.3}/datamule/config.py +0 -0
  10. {datamule-1.5.2 → datamule-1.5.3}/datamule/data/listed_filer_metadata.csv +0 -0
  11. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/__init__.py +0 -0
  12. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/document.py +0 -0
  13. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/__init__.py +0 -0
  14. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/atsn.py +0 -0
  15. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/cfportal.py +0 -0
  16. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/d.py +0 -0
  17. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex102_abs.py +0 -0
  18. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex99a_sdr.py +0 -0
  19. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex99c_sdr.py +0 -0
  20. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex99g_sdr.py +0 -0
  21. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex99i_sdr.py +0 -0
  22. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/information_table.py +0 -0
  23. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/nmfp.py +0 -0
  24. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/npx.py +0 -0
  25. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/onefourtyfour.py +0 -0
  26. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ownership.py +0 -0
  27. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/proxy_voting_record.py +0 -0
  28. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/sbs.py +0 -0
  29. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/sbsef.py +0 -0
  30. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/schedule13.py +0 -0
  31. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/sdr.py +0 -0
  32. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/submission_metadata.py +0 -0
  33. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ta.py +0 -0
  34. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/thirteenfhr.py +0 -0
  35. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/twentyfivense.py +0 -0
  36. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  37. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/processing.py +0 -0
  38. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/table.py +0 -0
  39. {datamule-1.5.2 → datamule-1.5.3}/datamule/helper.py +0 -0
  40. {datamule-1.5.2 → datamule-1.5.3}/datamule/index.py +0 -0
  41. {datamule-1.5.2 → datamule-1.5.3}/datamule/mapping_dicts/__init__.py +0 -0
  42. {datamule-1.5.2 → datamule-1.5.3}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  43. {datamule-1.5.2 → datamule-1.5.3}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  44. {datamule-1.5.2 → datamule-1.5.3}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  45. {datamule-1.5.2 → datamule-1.5.3}/datamule/package_updater.py +0 -0
  46. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/__init__.py +0 -0
  47. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/infrastructure/__init__.py +0 -0
  48. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  49. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/__init__.py +0 -0
  50. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/eftsquery.py +0 -0
  51. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/monitor.py +0 -0
  52. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/streamer.py +0 -0
  53. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/textsearch.py +0 -0
  54. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/utils.py +0 -0
  55. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/__init__.py +0 -0
  56. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  57. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  58. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  59. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  60. {datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/__init__.py +0 -0
  61. {datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/bq.py +0 -0
  62. {datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/query.py +0 -0
  63. {datamule-1.5.2 → datamule-1.5.3}/datamule/sheet.py +0 -0
  64. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/SOURCES.txt +0 -0
  65. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-1.5.2 → datamule-1.5.3}/setup.cfg +0 -0
{datamule-1.5.2 → datamule-1.5.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.2
+Version: 1.5.3
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-1.5.2 → datamule-1.5.3}/datamule/portfolio.py
@@ -34,7 +34,6 @@ class Portfolio:
 
     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir() or f.suffix=='.tar']
-        print(folders)
         print(f"Loading {len(folders)} submissions")
 
         def load_submission(folder):
@@ -126,7 +125,8 @@ class Portfolio:
             # First query, just set the accession numbers
             self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],requests_per_second=5, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
+                             requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -143,7 +143,9 @@ class Portfolio:
                 submission_type=submission_type,
                 filing_date=filing_date,
                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
-                keep_document_types=document_type
+                keep_document_types=document_type,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata,
             )
         else:
             sec_download(
@@ -153,7 +155,9 @@ class Portfolio:
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
-                keep_document_types=document_type
+                keep_document_types=document_type,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata,
             )
 
         self.submissions_loaded = False
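
The two new flags on Portfolio.download_submissions are simply forwarded, through whichever provider's downloader is selected, to write_sgml_file_to_tar. A minimal usage sketch, assuming the package-level Portfolio export; the output directory, ticker, and form type below are illustrative and not taken from this diff:

    from datamule import Portfolio

    portfolio = Portfolio("filings")              # illustrative output directory
    portfolio.download_submissions(
        ticker="AAPL",                            # illustrative filer
        submission_type="10-K",                   # illustrative form type
        document_type=["10-K"],                   # keep only these document types
        keep_filtered_metadata=False,             # new in 1.5.3: drop metadata for filtered-out documents
        standardize_metadata=True,                # new in 1.5.3: passed through to the SGML-to-tar writer
    )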
{datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/downloader.py
@@ -5,7 +5,7 @@ from tqdm import tqdm
 
 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
              requests_per_second=5, output_dir="filings", accession_numbers=None,
-             quiet=False, keep_document_types=[]):
+             quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
     # Make sure output directory exists
     os.makedirs(output_dir, exist_ok=True)
 
@@ -14,7 +14,8 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
     # Create a wrapper for the download_callback that includes the output_dir
     async def callback_wrapper(hit, content, cik, accno, url):
         output_path = os.path.join(output_dir, accno.replace('-','') + '.tar')
-        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types)
+        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types,keep_filtered_metadata=keep_filtered_metadata,
+                               standardize_metadata=standardize_metadata)
         pbar.update(1)
 
 
{datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/downloader.py
@@ -74,7 +74,7 @@ class Downloader:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
     class FileProcessor:
-        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[]):
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[], keep_filtered_metadata=False,standardize_metadata=True):
            self.processing_queue = Queue(maxsize=queue_size)
            self.should_stop = False
            self.processing_workers = []
@@ -84,6 +84,8 @@ class Downloader:
            self.pbar = pbar
            self.downloader = downloader
            self.keep_document_types = keep_document_types
+           self.keep_filtered_metadata = keep_filtered_metadata
+           self.standardize_metadata = standardize_metadata
 
        def start_processing_workers(self):
            for _ in range(self.max_workers):
@@ -95,7 +97,7 @@
        def _process_file(self, item):
            filename, content = item
            output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
-           write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types)
+           write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
            self.pbar.update(1)
 
        def _processing_worker(self):
@@ -204,11 +206,12 @@
            except Exception as e:
                self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir, keep_document_types=[]):
+    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
        os.makedirs(output_dir, exist_ok=True)
 
        with tqdm(total=len(urls), desc="Processing files") as pbar:
-           processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
+           processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types,
+                                          keep_filtered_metadata=keep_filtered_metadata,standardize_metadata=standardize_metadata)
            processor.start_processing_workers()
 
            semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
@@ -231,7 +234,7 @@
            processor.stop_workers()
            decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
        """
        Query SEC filings and download/process them.
 
@@ -242,6 +245,7 @@
        - output_dir: Directory to save downloaded files
        - accession_numbers: List of specific accession numbers to download
        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+       - keep_filtered_metadata: Whether to keep metadata for filtered documents
        """
        if self.api_key is None:
            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
@@ -279,7 +283,7 @@
        start_time = time.time()
 
        # Process the batch asynchronously
-       asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))
+       asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
 
        # Calculate and display performance metrics
        elapsed_time = time.time() - start_time
@@ -292,7 +296,7 @@
        self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
    """
    Query SEC filings and download/process them.
 
@@ -304,6 +308,7 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
    - output_dir: Directory to save downloaded files
    - accession_numbers: List of specific accession numbers to download
    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+   - keep_filtered_metadata: Whether to keep metadata for filtered documents
    """
    if accession_numbers:
        accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
@@ -317,5 +322,7 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
        filing_date=filing_date,
        output_dir=output_dir,
        accession_numbers=accession_numbers,
-       keep_document_types=keep_document_types
+       keep_document_types=keep_document_types,
+       keep_filtered_metadata=keep_filtered_metadata,
+       standardize_metadata=standardize_metadata
    )
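
In the seclibrary path the same two flags are threaded end to end: the module-level download() passes them to Downloader.download(), which forwards them through process_batch() and FileProcessor into write_sgml_file_to_tar(). A hedged sketch of a direct call; the CIK, form type, date range, and API key are placeholders, and the accepted filing_date formats are not shown in this diff:

    from datamule.seclibrary.downloader import download

    download(
        cik="0000320193",                          # placeholder CIK
        submission_type="10-K",                    # placeholder form type
        filing_date=("2023-01-01", "2023-12-31"),  # placeholder date range
        api_key="YOUR_DATAMULE_API_KEY",           # or set the DATAMULE_API_KEY environment variable
        output_dir="downloads",
        keep_document_types=["10-K"],
        keep_filtered_metadata=False,
        standardize_metadata=True,
    )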
{datamule-1.5.2 → datamule-1.5.3}/datamule/submission.py
@@ -2,11 +2,79 @@ from pathlib import Path
 import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
+from secsgml.utils import bytes_to_str
 import tarfile
 import shutil
 import zstandard as zstd
-from io import BytesIO
 import gzip
+import io
+import copy
+
+
+def calculate_documents_locations_in_tar(metadata, documents):
+    # Step 1: Add placeholder byte positions to get accurate size (10-digit padded)
+    placeholder_metadata = copy.deepcopy(metadata)
+
+    for file_num in range(len(documents)):
+        if 'documents' in placeholder_metadata:
+            placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999" # 10 digits
+            placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999" # 10 digits
+
+    # Step 2: Calculate size with placeholders
+    placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
+    placeholder_json = json.dumps(placeholder_str).encode('utf-8')
+    metadata_size = len(placeholder_json)
+
+    # Step 3: Now calculate actual positions using this size
+    current_pos = 512 + metadata_size
+    current_pos += (512 - (current_pos % 512)) % 512
+
+    # Step 4: Calculate real positions and update original metadata (10-digit padded)
+    for file_num, content in enumerate(documents):
+        start_byte = current_pos + 512
+        end_byte = start_byte + len(content)
+
+        if 'documents' in metadata:
+            metadata['documents'][file_num]['secsgml_start_byte'] = f"{start_byte:010d}" # 10-digit padding
+            metadata['documents'][file_num]['secsgml_end_byte'] = f"{end_byte:010d}" # 10-digit padding
+
+
+        file_total_size = 512 + len(content)
+        padded_size = file_total_size + (512 - (file_total_size % 512)) % 512
+        current_pos += padded_size
+
+    return metadata
+
+
+def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
+    # Write tar directly to disk
+    with tarfile.open(output_path, 'w') as tar:
+
+        # calculate document locations in tar
+        metadata = calculate_documents_locations_in_tar(metadata, documents)
+
+        # serialize metadata
+        metadata_str = bytes_to_str(metadata,lower=False)
+        metadata_json = json.dumps(metadata_str).encode('utf-8')
+        # save metadata
+        tarinfo = tarfile.TarInfo(name='metadata.json')
+        tarinfo.size = len(metadata_json)
+        tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+        for file_num, content in enumerate(documents, 0):
+            if standardize_metadata:
+                document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
+
+            compression = compression_list[file_num]
+            if compression == 'gzip':
+                document_name = f'{document_name}.gz'
+            elif compression == 'zstd':
+                document_name = f'{document_name}.zst'
+
+
+            tarinfo = tarfile.TarInfo(name=f'{document_name}')
+            tarinfo.size = len(content)
+            tar.addfile(tarinfo, io.BytesIO(content))
 
 class Submission:
     def __init__(self, path=None,sgml_content=None,keep_document_types=None):
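
The offsets recorded by calculate_documents_locations_in_tar follow the POSIX tar layout: each member is a 512-byte header followed by its content, padded up to the next 512-byte boundary, with metadata.json written first. The placeholder pass exists because the offsets are themselves stored inside metadata.json, so its serialized size must be pinned (hence the fixed 10-digit padding) before the real positions can be computed. A standalone sketch of the same arithmetic with invented payload sizes:

    # Toy re-derivation of the tar offsets; the payload sizes are made up.
    metadata_json = b"{}" * 300                        # stand-in for the serialized metadata.json (600 bytes)
    documents = [b"x" * 700, b"y" * 100]               # stand-in document payloads

    pos = 512 + len(metadata_json)                     # metadata.json header + content
    pos += (512 - (pos % 512)) % 512                   # pad to the next 512-byte block

    for content in documents:
        start = pos + 512                              # content begins after this member's header
        end = start + len(content)
        print(f"{start:010d}-{end:010d}")              # 10-digit zero padding, as stored in the metadata
        member = 512 + len(content)
        pos += member + (512 - (member % 512)) % 512   # advance past the padded member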
@@ -68,51 +136,34 @@ class Submission:
         if compression is not None and compression not in ['gzip', 'zstd']:
             raise ValueError("compression must be 'gzip' or 'zstd'")
 
+        # check if we're loading from a dir or a tar file
+        is_dir_not_tar = True
+        if self.path.suffix == '.tar':
+            is_dir_not_tar = False
+        elif not self.path.is_dir():
+            raise ValueError("Path must be a directory to compress")
         # Create tar file (replace directory with .tar file)
         tar_path = self.path.with_suffix('.tar')
+
+        # load all files in the directory or tar file
+        documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
 
-        with tarfile.open(tar_path, 'w') as tar:
-            # Add metadata.json first
-            metadata_path = self.path / 'metadata.json'
-            if metadata_path.exists():
-                tar.add(metadata_path, arcname='metadata.json')
-
-            # Add documents in order
-            for doc in self.metadata.content['documents']:
-                filename = doc.get('filename')
-                if filename is None:
-                    filename = doc['sequence'] + '.txt'
-
-                file_path = self.path / filename
-                if file_path.exists():
-                    file_size = file_path.stat().st_size
 
-
-                    # Compress if compression specified and over threshold
-                    if compression is not None and file_size >= threshold:
-                        content = file_path.read_bytes()
-
-                        if compression == 'gzip':
-                            compressed_content = gzip.compress(content, compresslevel=level or 6)
-                            compressed_filename = filename + '.gz'
-                        else: # zstd
-                            cctx = zstd.ZstdCompressor(level=level or 3)
-                            compressed_content = cctx.compress(content)
-                            compressed_filename = filename + '.zst'
-
-                        # Add compressed file to tar
-                        tarinfo = tarfile.TarInfo(name=compressed_filename)
-                        tarinfo.size = len(compressed_content)
-                        tar.addfile(tarinfo, BytesIO(compressed_content))
-                    else:
-                        # Add uncompressed file
-                        tar.add(file_path, arcname=filename)
+        # we should compress everything here first.
+        compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
+        documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
+                     len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
+                     len(doc) >= threshold else doc for doc in documents]
 
+        metadata = self.metadata.content.copy()
+        write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
+
         # Delete original folder
-        shutil.rmtree(self.path)
-
-        # Update path to point to new tar file
-        self.path = tar_path
+        if is_dir_not_tar:
+            shutil.rmtree(self.path)
+        # otherwise, we already replaced the tar file
+        # Update path to point to new tar file
+        self.path = tar_path
 
     def decompress(self):
         if self.path is None:
@@ -129,17 +180,36 @@
                 if member.isfile():
                     content = tar.extractfile(member).read()
 
-                    # Decompress if gzipped
+                    # Decompress based on file extension
                     if member.name.endswith('.gz'):
                         content = gzip.decompress(content)
                         output_path = output_dir / member.name[:-3] # Remove .gz extension
+                    elif member.name.endswith('.zst'):
+                        dctx = zstd.ZstdDecompressor()
+                        content = dctx.decompress(content)
+                        output_path = output_dir / member.name[:-4] # Remove .zst extension
                     else:
                         output_path = output_dir / member.name
 
-                    # Write to output directory
-                    output_path.parent.mkdir(parents=True, exist_ok=True)
-                    with output_path.open('wb') as f:
-                        f.write(content)
+                    # check if it is metadata.json
+                    if output_path.name == 'metadata.json':
+                        # load as json
+                        metadata = json.loads(content.decode('utf-8'))
+                        # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
+                        for doc in metadata['documents']:
+                            if 'secsgml_start_byte' in doc:
+                                del doc['secsgml_start_byte']
+
+                            if 'secsgml_end_byte' in doc:
+                                del doc['secsgml_end_byte']
+
+                        with output_path.open('w', encoding='utf-8') as f:
+                            json.dump(metadata, f)
+                    else:
+                        # Write to output directory
+                        output_path.parent.mkdir(parents=True, exist_ok=True)
+                        with output_path.open('wb') as f:
+                            f.write(content)
 
         # delete original file
         self.path.unlink()
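
Taken together, compress() now rewrites a submission directory (or an existing tar) into a single tar whose metadata.json carries the secsgml_start_byte/secsgml_end_byte offsets, and decompress() reverses this and strips those keys again. A rough round-trip sketch, assuming the package-level Submission export and that compression, level, and threshold are keyword arguments of compress() as the hunk's variable names suggest; the path and numeric values are illustrative:

    from datamule import Submission

    sub = Submission(path="filings/000032019324000123")   # illustrative path to a downloaded submission
    sub.compress(compression="zstd", level=3, threshold=1_048_576)
    # sub.path now points at filings/000032019324000123.tar; documents at or above
    # the threshold are stored with a .zst suffix inside the tar.

    sub.decompress()   # re-expands the tar and drops secsgml_start_byte / secsgml_end_byte from metadata.json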
{datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.2
+Version: 1.5.3
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-1.5.2 → datamule-1.5.3}/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.5.2",
+    version="1.5.3",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",