datamule 1.6.2.tar.gz → 1.6.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {datamule-1.6.2 → datamule-1.6.4}/PKG-INFO +1 -1
  2. {datamule-1.6.2 → datamule-1.6.4}/datamule/portfolio.py +49 -45
  3. datamule-1.6.4/datamule/portfolio_compression_utils.py +291 -0
  4. {datamule-1.6.2 → datamule-1.6.4}/datamule/submission.py +56 -164
  5. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/PKG-INFO +1 -1
  6. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/SOURCES.txt +1 -0
  7. {datamule-1.6.2 → datamule-1.6.4}/setup.py +1 -1
  8. {datamule-1.6.2 → datamule-1.6.4}/datamule/__init__.py +0 -0
  9. {datamule-1.6.2 → datamule-1.6.4}/datamule/config.py +0 -0
  10. {datamule-1.6.2 → datamule-1.6.4}/datamule/data/listed_filer_metadata.csv +0 -0
  11. {datamule-1.6.2 → datamule-1.6.4}/datamule/datamule/__init__.py +0 -0
  12. {datamule-1.6.2 → datamule-1.6.4}/datamule/datamule/sec_connector.py +0 -0
  13. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/__init__.py +0 -0
  14. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/document.py +0 -0
  15. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/__init__.py +0 -0
  16. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/atsn.py +0 -0
  17. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/cfportal.py +0 -0
  18. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/d.py +0 -0
  19. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex102_abs.py +0 -0
  20. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex99a_sdr.py +0 -0
  21. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex99c_sdr.py +0 -0
  22. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex99g_sdr.py +0 -0
  23. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex99i_sdr.py +0 -0
  24. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/information_table.py +0 -0
  25. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/nmfp.py +0 -0
  26. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/npx.py +0 -0
  27. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/onefourtyfour.py +0 -0
  28. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ownership.py +0 -0
  29. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/proxy_voting_record.py +0 -0
  30. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/sbs.py +0 -0
  31. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/sbsef.py +0 -0
  32. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/schedule13.py +0 -0
  33. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/sdr.py +0 -0
  34. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/submission_metadata.py +0 -0
  35. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ta.py +0 -0
  36. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/thirteenfhr.py +0 -0
  37. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/twentyfivense.py +0 -0
  38. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  39. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings_new/__init__.py +0 -0
  40. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings_new/mappings.py +0 -0
  41. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings_new/ownership.py +0 -0
  42. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/processing.py +0 -0
  43. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/table.py +0 -0
  44. {datamule-1.6.2 → datamule-1.6.4}/datamule/helper.py +0 -0
  45. {datamule-1.6.2 → datamule-1.6.4}/datamule/index.py +0 -0
  46. {datamule-1.6.2 → datamule-1.6.4}/datamule/mapping_dicts/__init__.py +0 -0
  47. {datamule-1.6.2 → datamule-1.6.4}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  48. {datamule-1.6.2 → datamule-1.6.4}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  49. {datamule-1.6.2 → datamule-1.6.4}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  50. {datamule-1.6.2 → datamule-1.6.4}/datamule/package_updater.py +0 -0
  51. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/__init__.py +0 -0
  52. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/infrastructure/__init__.py +0 -0
  53. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  54. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/__init__.py +0 -0
  55. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/downloader.py +0 -0
  56. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/eftsquery.py +0 -0
  57. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/monitor.py +0 -0
  58. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/streamer.py +0 -0
  59. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/textsearch.py +0 -0
  60. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/utils.py +0 -0
  61. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/__init__.py +0 -0
  62. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  63. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  64. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  65. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  66. {datamule-1.6.2 → datamule-1.6.4}/datamule/seclibrary/__init__.py +0 -0
  67. {datamule-1.6.2 → datamule-1.6.4}/datamule/seclibrary/bq.py +0 -0
  68. {datamule-1.6.2 → datamule-1.6.4}/datamule/seclibrary/downloader.py +0 -0
  69. {datamule-1.6.2 → datamule-1.6.4}/datamule/seclibrary/query.py +0 -0
  70. {datamule-1.6.2 → datamule-1.6.4}/datamule/sheet.py +0 -0
  71. {datamule-1.6.2 → datamule-1.6.4}/datamule/utils/__init__.py +0 -0
  72. {datamule-1.6.2 → datamule-1.6.4}/datamule/utils/construct_submissions_data.py +0 -0
  73. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/dependency_links.txt +0 -0
  74. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/requires.txt +0 -0
  75. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/top_level.txt +0 -0
  76. {datamule-1.6.2 → datamule-1.6.4}/setup.cfg +0 -0
{datamule-1.6.2 → datamule-1.6.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.6.2
+ Version: 1.6.4
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
{datamule-1.6.2 → datamule-1.6.4}/datamule/portfolio.py
@@ -12,9 +12,12 @@ from .helper import _process_cik_and_metadata_filters
  from .seclibrary.downloader import download as seclibrary_download
  from .sec.xbrl.filter_xbrl import filter_xbrl
  from .sec.submissions.monitor import Monitor
+ from .portfolio_compression_utils import CompressionManager
  #from .sec.xbrl.xbrlmonitor import XBRLMonitor
  from .datamule.sec_connector import SecConnector
-
+ from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+ import json
+ import io
 
  class Portfolio:
      def __init__(self, path):
@@ -48,11 +51,7 @@ class Portfolio:
 
          # Load regular submissions (existing logic)
          def load_submission(folder):
-             try:
-                 return Submission(folder)
-             except Exception as e:
-                 print(f"Error loading submission from {folder}: {str(e)}")
-                 return None
+             return Submission(folder)
 
          regular_submissions = []
          if regular_items:
@@ -76,10 +75,7 @@ class Portfolio:
 
          # Collect results as they complete
          for future in as_completed(futures):
-             try:
-                 batch_submissions.extend(future.result())
-             except Exception as e:
-                 print(f"Error in batch processing: {str(e)}")
+             batch_submissions.extend(future.result())
 
          # Combine and filter None values
          self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
@@ -87,46 +83,54 @@ class Portfolio:
 
      def _load_batch_submissions_worker(self, batch_tar_path, pbar):
          """Worker function to load submissions from one batch tar with progress updates"""
-         try:
-             # Open tar handle and store it
-             tar_handle = tarfile.open(batch_tar_path, 'r')
-             self.batch_tar_handles[batch_tar_path] = tar_handle
-             self.batch_tar_locks[batch_tar_path] = Lock()
-
-             # Find all accession directories
-             accession_prefixes = set()
-             for member in tar_handle.getmembers():
-                 if '/' in member.name and member.name.endswith('metadata.json'):
-                     accession_prefix = member.name.split('/')[0]
-                     accession_prefixes.add(accession_prefix)
-
-             # Create submissions for each accession
-             submissions = []
-             for accession_prefix in accession_prefixes:
-                 try:
-                     submission = Submission(
-                         batch_tar_path=batch_tar_path,
-                         accession_prefix=accession_prefix,
-                         portfolio_ref=self
-                     )
-                     submissions.append(submission)
-                     pbar.update(1) # Update progress for each successful submission
-                 except Exception as e:
-                     print(f"Error loading batch submission {accession_prefix} from {batch_tar_path.name}: {str(e)}")
-
-             return submissions
+         # Open tar handle and store it
+         tar_handle = tarfile.open(batch_tar_path, 'r')
+         self.batch_tar_handles[batch_tar_path] = tar_handle
+         self.batch_tar_locks[batch_tar_path] = Lock()
+
+         # Find all accession directories
+         accession_prefixes = set()
+         for member in tar_handle.getmembers():
+             if '/' in member.name and member.name.endswith('metadata.json'):
+                 accession_prefix = member.name.split('/')[0]
+                 accession_prefixes.add(accession_prefix)
+
+         # Create submissions for each accession
+         submissions = []
+         for accession_prefix in accession_prefixes:
+             submission = Submission(
+                 batch_tar_path=batch_tar_path,
+                 accession_prefix=accession_prefix,
+                 portfolio_ref=self
+             )
+             submissions.append(submission)
+             pbar.update(1) # Update progress for each successful submission
+
+         return submissions
 
-         except Exception as e:
-             print(f"Error loading batch tar {batch_tar_path}: {str(e)}")
-             return []
+
+     def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
+         """
+         Compress all individual submissions into batch tar files.
+
+         Args:
+             compression: None, 'gzip', or 'zstd' for document compression (default: None)
+             compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+             threshold: Size threshold for compressing individual documents (default: 1MB)
+             max_batch_size: Maximum size per batch tar file (default: 1GB)
+         """
+         CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
+
+     def decompress(self):
+         """
+         Decompress all batch tar files back to individual submission directories.
+         """
+         CompressionManager().decompress_portfolio(self, self.MAX_WORKERS)
 
      def _close_batch_handles(self):
          """Close all open batch tar handles to free resources"""
          for handle in self.batch_tar_handles.values():
-             try:
-                 handle.close()
-             except Exception as e:
-                 print(f"Error closing batch tar handle: {str(e)}")
+             handle.close()
          self.batch_tar_handles.clear()
         self.batch_tar_locks.clear()
 
datamule-1.6.4/datamule/portfolio_compression_utils.py (new file)
@@ -0,0 +1,291 @@
+ import json
+ import io
+ import gzip
+ import zstandard as zstd
+ import tarfile
+ import shutil
+ from tqdm import tqdm
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+
+
+ class CompressionManager:
+
+     def compress_portfolio(self, portfolio, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024, max_workers=None):
+         """
+         Compress all individual submissions into batch tar files.
+
+         Args:
+             portfolio: Portfolio instance
+             compression: None, 'gzip', or 'zstd' for document compression (default: None)
+             compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+             threshold: Size threshold for compressing individual documents (default: 1MB)
+             max_batch_size: Maximum size per batch tar file (default: 1GB)
+             max_workers: Number of threads for parallel document processing (default: portfolio.MAX_WORKERS)
+         """
+         if max_workers is None:
+             max_workers = portfolio.MAX_WORKERS
+
+         portfolio._close_batch_handles()
+
+         if not portfolio.submissions_loaded:
+             portfolio._load_submissions()
+
+         # Only compress non-batch submissions
+         submissions = [s for s in portfolio.submissions if s.batch_tar_path is None]
+
+         if not submissions:
+             print("No submissions to compress")
+             return
+
+         print(f"Compressing {len(submissions)} submissions...")
+
+         # Set default compression level if not specified
+         if compression_level is None:
+             compression_level = 6 if compression == 'gzip' else 3
+
+         # Group submissions into batches
+         current_batch = 0
+         current_size = 0
+         sequence = 1
+         current_tar = None
+
+         with tqdm(total=len(submissions), desc="Compressing submissions") as pbar:
+             for submission in submissions:
+                 # Parallel document processing
+                 with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                     doc_futures = [
+                         executor.submit(self._process_document, doc, compression, threshold, compression_level)
+                         for doc in submission
+                     ]
+
+                 # Collect results maintaining order
+                 documents = []
+                 compression_list = []
+                 for future in doc_futures:
+                     content, compression_type = future.result()
+                     documents.append(content)
+                     compression_list.append(compression_type)
+
+                 # Calculate submission size
+                 metadata_str = bytes_to_str(submission.metadata.content, lower=False)
+                 metadata_json = json.dumps(metadata_str).encode('utf-8')
+                 submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+                 # Check if we need a new batch tar
+                 if current_size > 0 and current_size + submission_size > max_batch_size:
+                     if current_tar:
+                         current_tar.close()
+                     sequence += 1
+                     current_size = 0
+                     current_tar = None
+
+                 # Create tar if needed
+                 if current_tar is None:
+                     batch_path = portfolio.path / f'batch_{current_batch:03d}_{sequence:03d}.tar'
+                     current_tar = tarfile.open(batch_path, 'w')
+
+                 # Write submission to tar
+                 self._write_submission_to_tar(
+                     current_tar,
+                     submission,
+                     documents,
+                     compression_list,
+                     submission.accession
+                 )
+
+                 current_size += submission_size
+
+                 # Remove original submission directory/tar
+                 if submission.path:
+                     if submission.path.is_dir():
+                         shutil.rmtree(submission.path)
+                     elif submission.path.suffix == '.tar':
+                         submission.path.unlink()
+
+                 pbar.update(1)
+
+         # Close final tar
+         if current_tar:
+             current_tar.close()
+
+         # Reload submissions to reflect new batch structure
+         portfolio.submissions_loaded = False
+         portfolio._load_submissions()
+
+         print("Compression complete.")
+
+     def decompress_portfolio(self, portfolio, max_workers=None):
+         """
+         Decompress all batch tar files back to individual submission directories.
+
+         Args:
+             portfolio: Portfolio instance
+             max_workers: Number of threads for parallel file processing (default: portfolio.MAX_WORKERS)
+         """
+         if max_workers is None:
+             max_workers = portfolio.MAX_WORKERS
+
+         if not portfolio.submissions_loaded:
+             portfolio._load_submissions()
+
+         # Find all batch tar files
+         batch_tars = [f for f in portfolio.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
+
+         if not batch_tars:
+             print("No batch tar files found to decompress")
+             return
+
+         print(f"Decompressing {len(batch_tars)} batch tar files...")
+
+         # FIRST: Close all batch tar handles to free the files
+         portfolio._close_batch_handles()
+
+         total_extracted = 0
+
+         with tqdm(desc="Decompressing submissions", unit="submissions") as pbar:
+             for batch_tar in batch_tars:
+                 with tarfile.open(batch_tar, 'r') as tar:
+                     # Find all accession directories in this tar
+                     accession_dirs = set()
+                     for member in tar.getmembers():
+                         if '/' in member.name:
+                             accession_dir = member.name.split('/')[0]
+                             accession_dirs.add(accession_dir)
+
+                     # Extract each submission
+                     for accession_dir in accession_dirs:
+                         output_dir = portfolio.path / accession_dir
+                         output_dir.mkdir(exist_ok=True)
+
+                         # Get all files for this accession
+                         accession_files = [m for m in tar.getmembers()
+                                            if m.name.startswith(f'{accession_dir}/') and m.isfile()]
+
+                         # Parallel file extraction
+                         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                             file_futures = [
+                                 executor.submit(self._extract_file, member, tar, accession_dir, output_dir)
+                                 for member in accession_files
+                             ]
+
+                         # Wait for all files to be processed
+                         for future in as_completed(file_futures):
+                             future.result()
+
+                         total_extracted += 1
+                         pbar.update(1)
+
+
+         # NOW delete the batch tar files after everything is extracted
+         for batch_tar in batch_tars:
+             batch_tar.unlink()
+
+
+         # Reload submissions to reflect new directory structure
+         portfolio.submissions_loaded = False
+         portfolio._load_submissions()
+
+         print(f"Decompression complete. Extracted {total_extracted} submissions.")
+
+     def _process_document(self, doc, compression, threshold, compression_level):
+         """Process a single document: load content and apply compression if needed."""
+         content = doc.content
+         if isinstance(content, str):
+             content = content.encode('utf-8')
+
+         # Apply document-level compression if threshold met AND compression is specified
+         if compression and len(content) >= threshold:
+             if compression == 'gzip':
+                 content = gzip.compress(content, compresslevel=compression_level)
+                 compression_type = 'gzip'
+             elif compression == 'zstd':
+                 content = zstd.ZstdCompressor(level=compression_level).compress(content)
+                 compression_type = 'zstd'
+             else:
+                 compression_type = ''
+         else:
+             compression_type = ''
+
+         return content, compression_type
+
+     def _extract_file(self, member, tar, accession_dir, output_dir):
+         """Extract and decompress a single file from tar."""
+         relative_path = member.name[len(accession_dir)+1:] # Remove accession prefix
+         output_path = output_dir / relative_path
+
+         content = tar.extractfile(member).read()
+
+         # Handle decompression based on filename
+         if relative_path.endswith('.gz'):
+             # File MUST be gzipped if it has .gz extension
+             content = gzip.decompress(content)
+             output_path = output_path.with_suffix('') # Remove .gz
+
+         elif relative_path.endswith('.zst'):
+             # File MUST be zstd compressed if it has .zst extension
+             content = zstd.ZstdDecompressor().decompress(content)
+             output_path = output_path.with_suffix('') # Remove .zst
+
+         # Special handling for metadata.json
+         if output_path.name == 'metadata.json':
+             metadata = json.loads(content.decode('utf-8'))
+             # Remove tar-specific metadata
+             for doc in metadata['documents']:
+                 doc.pop('secsgml_start_byte', None)
+                 doc.pop('secsgml_end_byte', None)
+
+                 # Update filenames to match decompressed files
+                 filename = doc.get('filename', '')
+                 if filename.endswith('.gz'):
+                     doc['filename'] = filename[:-3] # Remove .gz
+                 elif filename.endswith('.zst'):
+                     doc['filename'] = filename[:-4] # Remove .zst
+
+             with output_path.open('w', encoding='utf-8') as f:
+                 json.dump(metadata, f, indent=2)
+         else:
+             # Write document file
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+             with output_path.open('wb') as f:
+                 f.write(content)
+
+
+     def _write_submission_to_tar(self, tar_handle, submission, documents, compression_list, accession_prefix):
+         """Write a submission to a tar file with optional document compression."""
+         # Prepare metadata
+         metadata = submission.metadata.content.copy()
+
+         # Update filenames for compressed documents BEFORE size calculation
+         for i, compression in enumerate(compression_list):
+             if compression:
+                 doc = metadata['documents'][i]
+                 filename = doc.get('filename', doc['sequence'] + '.txt')
+                 if compression == 'gzip' and not filename.endswith('.gz'):
+                     doc['filename'] = filename + '.gz'
+                 elif compression == 'zstd' and not filename.endswith('.zst'):
+                     doc['filename'] = filename + '.zst'
+
+         # Add document sizes to metadata for calculate_documents_locations_in_tar
+         for i, content in enumerate(documents):
+             metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+         # NOW calculate document positions with the correct filenames
+         metadata = calculate_documents_locations_in_tar(metadata)
+
+         # Write metadata
+         metadata_str = bytes_to_str(metadata, lower=False)
+         metadata_json = json.dumps(metadata_str).encode('utf-8')
+
+         tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/metadata.json')
+         tarinfo.size = len(metadata_json)
+         tar_handle.addfile(tarinfo, io.BytesIO(metadata_json))
+
+         # Write documents
+         for i, content in enumerate(documents):
+             doc = metadata['documents'][i]
+             filename = doc.get('filename', doc['sequence'] + '.txt')
+
+             tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/{filename}')
+             tarinfo.size = len(content)
+             tar_handle.addfile(tarinfo, io.BytesIO(content))
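Taken together, the portfolio.py and portfolio_compression_utils.py changes replace the per-submission compress()/decompress() methods (removed from submission.py below) with portfolio-level batching: Portfolio.compress() packs every loose submission into batch_NNN_NNN.tar files, optionally gzip- or zstd-compressing documents at or above the size threshold, and Portfolio.decompress() restores one directory per accession. A minimal usage sketch, assuming a directory of already-downloaded submissions ("filings" below is an illustrative path, and Portfolio is imported from the top-level package as in the project README):

    from datamule import Portfolio

    # Sketch of the 1.6.4 batch-compression workflow; "filings" stands in for a
    # directory of previously downloaded submissions.
    portfolio = Portfolio("filings")

    # Pack individual submissions into batch_*.tar files; documents of 1 MB or
    # more are zstd-compressed and stored with a .zst suffix in the tar.
    portfolio.compress(compression='zstd')

    # Later: unpack everything back into one directory per accession number.
    portfolio.decompress()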
{datamule-1.6.2 → datamule-1.6.4}/datamule/submission.py
@@ -2,46 +2,12 @@ from pathlib import Path
  import json
  from .document.document import Document
  from secsgml import parse_sgml_content_into_memory
- from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
  from secsgml.parse_sgml import transform_metadata_string
  import tarfile
- import shutil
  import zstandard as zstd
  import gzip
- import io
 
 
-
- def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
-     # Write tar directly to disk
-     with tarfile.open(output_path, 'w') as tar:
-
-         # calculate document locations in tar
-         metadata = calculate_documents_locations_in_tar(metadata, documents)
-
-         # serialize metadata
-         metadata_str = bytes_to_str(metadata,lower=False)
-         metadata_json = json.dumps(metadata_str).encode('utf-8')
-         # save metadata
-         tarinfo = tarfile.TarInfo(name='metadata.json')
-         tarinfo.size = len(metadata_json)
-         tar.addfile(tarinfo, io.BytesIO(metadata_json))
-
-         for file_num, content in enumerate(documents, 0):
-             if standardize_metadata:
-                 document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
-
-             compression = compression_list[file_num]
-             if compression == 'gzip':
-                 document_name = f'{document_name}.gz'
-             elif compression == 'zstd':
-                 document_name = f'{document_name}.zst'
-
-
-             tarinfo = tarfile.TarInfo(name=f'{document_name}')
-             tarinfo.size = len(content)
-             tar.addfile(tarinfo, io.BytesIO(content))
-
  class Submission:
      def __init__(self, path=None, sgml_content=None, keep_document_types=None,
                   batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
@@ -128,94 +94,6 @@ class Submission:
          self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
          self.accession = self.metadata.content['accession-number']
          self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-     def compress(self, compression=None, level=None, threshold=1048576):
-         if self.path is None:
-             raise ValueError("Compress requires path")
-
-         if compression is not None and compression not in ['gzip', 'zstd']:
-             raise ValueError("compression must be 'gzip' or 'zstd'")
-
-         # check if we're loading from a dir or a tar file
-         is_dir_not_tar = True
-         if self.path.suffix == '.tar':
-             is_dir_not_tar = False
-         elif not self.path.is_dir():
-             raise ValueError("Path must be a directory to compress")
-         # Create tar file (replace directory with .tar file)
-         tar_path = self.path.with_suffix('.tar')
-
-         # load all files in the directory or tar file
-         documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
-
-
-         # we should compress everything here first.
-         compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
-         documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
-                      len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
-                      len(doc) >= threshold else doc for doc in documents]
-
-         metadata = self.metadata.content.copy()
-         write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
-
-         # Delete original folder
-         if is_dir_not_tar:
-             shutil.rmtree(self.path)
-         # otherwise, we already replaced the tar file
-         # Update path to point to new tar file
-         self.path = tar_path
-
-     def decompress(self):
-         if self.path is None:
-             raise ValueError("Decompress requires path")
-         elif self.path.suffix != '.tar':
-             raise ValueError("Can only decompress tar")
-
-         # Create output directory (path without .tar extension)
-         output_dir = self.path.with_suffix('')
-         output_dir.mkdir(exist_ok=True)
-
-         with tarfile.open(self.path, 'r') as tar:
-             for member in tar.getmembers():
-                 if member.isfile():
-                     content = tar.extractfile(member).read()
-
-                     # Decompress based on file extension
-                     if member.name.endswith('.gz'):
-                         content = gzip.decompress(content)
-                         output_path = output_dir / member.name[:-3] # Remove .gz extension
-                     elif member.name.endswith('.zst'):
-                         dctx = zstd.ZstdDecompressor()
-                         content = dctx.decompress(content)
-                         output_path = output_dir / member.name[:-4] # Remove .zst extension
-                     else:
-                         output_path = output_dir / member.name
-
-                     # check if it is metadata.json
-                     if output_path.name == 'metadata.json':
-                         # load as json
-                         metadata = json.loads(content.decode('utf-8'))
-                         # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
-                         for doc in metadata['documents']:
-                             if 'secsgml_start_byte' in doc:
-                                 del doc['secsgml_start_byte']
-
-                             if 'secsgml_end_byte' in doc:
-                                 del doc['secsgml_end_byte']
-
-                         with output_path.open('w', encoding='utf-8') as f:
-                             json.dump(metadata, f)
-                     else:
-                         # Write to output directory
-                         output_path.parent.mkdir(parents=True, exist_ok=True)
-                         with output_path.open('wb') as f:
-                             f.write(content)
-
-         # delete original file
-         self.path.unlink()
-         self.path = output_dir
 
      def _load_document_by_index(self, idx):
          """Load a document by its index in the metadata documents list."""
@@ -225,44 +103,38 @@ class Submission:
          if self.path is None and self.batch_tar_path is None:
              return self.documents[idx]
 
-         # Get filename
+         # Get filename from metadata - this is the source of truth
          filename = doc.get('filename')
          if filename is None:
              filename = doc['sequence'] + '.txt'
 
-         extension = Path(filename).suffix
+         # Get the base extension (before any compression extension)
+         # If filename ends with .gz or .zst, the real extension is before that
+         if filename.endswith('.gz'):
+             extension = Path(filename[:-3]).suffix
+             is_compressed = 'gzip'
+         elif filename.endswith('.zst'):
+             extension = Path(filename[:-4]).suffix
+             is_compressed = 'zstd'
+         else:
+             extension = Path(filename).suffix
+             is_compressed = False
 
          # Handle batch tar case
          if self.batch_tar_path is not None:
              with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
                  tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
 
-                 # Try different filename variations for compressed files
-                 possible_filenames = [
-                     f'{self.accession_prefix}/{filename}',
-                     f'{self.accession_prefix}/{filename}.gz',
-                     f'{self.accession_prefix}/{filename}.zst'
-                 ]
-
-                 content = None
-                 actual_filename = None
-                 for attempt_filename in possible_filenames:
-                     try:
-                         content = tar_handle.extractfile(attempt_filename).read()
-                         actual_filename = attempt_filename
-                         break
-                     except:
-                         continue
-
-                 if content is None:
-                     raise ValueError(f"Could not find document in batch tar: {self.batch_tar_path}, accession: {self.accession_prefix}, filename: {filename}")
+                 # Use exact filename from metadata
+                 tar_path = f'{self.accession_prefix}/{filename}'
+                 content = tar_handle.extractfile(tar_path).read()
+
 
-             # Decompress if compressed
-             if actual_filename.endswith('.gz'):
+             # Decompress if needed based on filename extension
+             if is_compressed == 'gzip':
                  content = gzip.decompress(content)
-             elif actual_filename.endswith('.zst'):
-                 dctx = zstd.ZstdDecompressor()
-                 content = dctx.decompress(content)
+             elif is_compressed == 'zstd':
+                 content = zstd.ZstdDecompressor().decompress(content)
 
              # Decode text files
              if extension in ['.htm', '.html', '.txt', '.xml']:
@@ -270,35 +142,56 @@ class Submission:
 
              document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
 
-         # Handle regular path case (existing logic)
+         # Handle regular path case
          else:
-             document_path = self.path / filename
-
+             # Check if path is a tar file (old format)
              if self.path.suffix == '.tar':
                  with tarfile.open(self.path, 'r') as tar:
-                     # so here is where we should use bytes instead with byte offset.
-                     # bandaid fix TODO
+                     # Try to extract the file, handling compression
                      try:
                          content = tar.extractfile(filename).read()
+                         actual_filename = filename
                      except:
                          try:
-                             content = tar.extractfile(filename+'.gz').read()
+                             content = tar.extractfile(filename + '.gz').read()
+                             actual_filename = filename + '.gz'
+                             is_compressed = 'gzip'
                          except:
-                             try:
-                                 content = tar.extractfile(filename+'.zst').read()
+                             try:
+                                 content = tar.extractfile(filename + '.zst').read()
+                                 actual_filename = filename + '.zst'
+                                 is_compressed = 'zstd'
                              except:
-                                 # some of these issues are on SEC data end, will fix when I setup cloud.
-                                 raise ValueError(f"Something went wrong with tar: {self.path}")
+                                 raise FileNotFoundError(f"Document file not found in tar: {filename}")
+
                  # Decompress if compressed
-                 if filename.endswith('.gz'):
+                 if is_compressed == 'gzip':
                      content = gzip.decompress(content)
-                 elif filename.endswith('.zst'):
-                     dctx = zstd.ZstdDecompressor()
-                     content = dctx.decompress(content)
+                 elif is_compressed == 'zstd':
+                     content = zstd.ZstdDecompressor().decompress(content)
+
+                 # Decode text files
+                 if extension in ['.htm', '.html', '.txt', '.xml']:
+                     content = content.decode('utf-8', errors='replace')
+
+                 document_path = f"{self.path}::{actual_filename}"
+
              else:
+                 # Regular directory case
+                 document_path = self.path / filename
+
+                 if not document_path.exists():
+                     raise FileNotFoundError(f"Document file not found: {document_path}")
+
                  with document_path.open('rb') as f:
                      content = f.read()
-
+
+                 # Decompress if needed based on filename extension
+                 if is_compressed == 'gzip':
+                     content = gzip.decompress(content)
+                 elif is_compressed == 'zstd':
+                     content = zstd.ZstdDecompressor().decompress(content)
+
                  # Decode text files
                  if extension in ['.htm', '.html', '.txt', '.xml']:
                      content = content.decode('utf-8', errors='replace')
@@ -311,7 +204,6 @@ class Submission:
              accession=self.accession,
              path=document_path
          )
-
      def __iter__(self):
          """Make Submission iterable by yielding all documents."""
          for idx in range(len(self.metadata.content['documents'])):
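The rewritten document loader above treats the filename recorded in metadata as the source of truth: a trailing .gz or .zst suffix indicates how the stored bytes are compressed, and the document's real extension sits immediately before that suffix. A standalone sketch of the same inference, using a helper name of our own rather than anything shipped in the package:

    import gzip
    from pathlib import Path

    import zstandard as zstd

    def load_stored_document(raw: bytes, filename: str) -> tuple[bytes, str]:
        # Mirror _load_document_by_index: undo the storage compression implied
        # by the metadata filename and return (content, real_extension).
        if filename.endswith('.gz'):
            return gzip.decompress(raw), Path(filename[:-3]).suffix
        if filename.endswith('.zst'):
            return zstd.ZstdDecompressor().decompress(raw), Path(filename[:-4]).suffix
        return raw, Path(filename).suffix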
{datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.6.2
+ Version: 1.6.4
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
{datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/SOURCES.txt
@@ -5,6 +5,7 @@ datamule/helper.py
  datamule/index.py
  datamule/package_updater.py
  datamule/portfolio.py
+ datamule/portfolio_compression_utils.py
  datamule/sheet.py
  datamule/submission.py
  datamule.egg-info/PKG-INFO
{datamule-1.6.2 → datamule-1.6.4}/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
  setup(
      name="datamule",
      author="John Friedman",
-     version="1.6.2",
+     version="1.6.4",
      description="Work with SEC submissions at scale.",
      packages=find_packages(include=['datamule', 'datamule.*']),
      url="https://github.com/john-friedman/datamule-python",