datamule 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/submission.py CHANGED
@@ -2,87 +2,28 @@ from pathlib import Path
  import json
  from .document.document import Document
  from secsgml import parse_sgml_content_into_memory
- from secsgml.utils import bytes_to_str
  from secsgml.parse_sgml import transform_metadata_string
  import tarfile
- import shutil
  import zstandard as zstd
  import gzip
- import io
- import copy


- def calculate_documents_locations_in_tar(metadata, documents):
-     # Step 1: Add placeholder byte positions to get accurate size (10-digit padded)
-     placeholder_metadata = copy.deepcopy(metadata)
-
-     for file_num in range(len(documents)):
-         if 'documents' in placeholder_metadata:
-             placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999" # 10 digits
-             placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999" # 10 digits
-
-     # Step 2: Calculate size with placeholders
-     placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
-     placeholder_json = json.dumps(placeholder_str).encode('utf-8')
-     metadata_size = len(placeholder_json)
-
-     # Step 3: Now calculate actual positions using this size
-     current_pos = 512 + metadata_size
-     current_pos += (512 - (current_pos % 512)) % 512
-
-     # Step 4: Calculate real positions and update original metadata (10-digit padded)
-     for file_num, content in enumerate(documents):
-         start_byte = current_pos + 512
-         end_byte = start_byte + len(content)
+ class Submission:
+     def __init__(self, path=None, sgml_content=None, keep_document_types=None,
+                  batch_tar_path=None, accession_prefix=None, portfolio_ref=None):

-         if 'documents' in metadata:
-             metadata['documents'][file_num]['secsgml_start_byte'] = f"{start_byte:010d}" # 10-digit padding
-             metadata['documents'][file_num]['secsgml_end_byte'] = f"{end_byte:010d}" # 10-digit padding
-
+         # Validate parameters
+         param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
+         if param_count != 1:
+             raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")

-         file_total_size = 512 + len(content)
-         padded_size = file_total_size + (512 - (file_total_size % 512)) % 512
-         current_pos += padded_size
-
-     return metadata
-
-
- def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
-     # Write tar directly to disk
-     with tarfile.open(output_path, 'w') as tar:
-
-         # calculate document locations in tar
-         metadata = calculate_documents_locations_in_tar(metadata, documents)
+         if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
+             raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")

-         # serialize metadata
-         metadata_str = bytes_to_str(metadata,lower=False)
-         metadata_json = json.dumps(metadata_str).encode('utf-8')
-         # save metadata
-         tarinfo = tarfile.TarInfo(name='metadata.json')
-         tarinfo.size = len(metadata_json)
-         tar.addfile(tarinfo, io.BytesIO(metadata_json))
-
-         for file_num, content in enumerate(documents, 0):
-             if standardize_metadata:
-                 document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
-
-             compression = compression_list[file_num]
-             if compression == 'gzip':
-                 document_name = f'{document_name}.gz'
-             elif compression == 'zstd':
-                 document_name = f'{document_name}.zst'
-
-
-             tarinfo = tarfile.TarInfo(name=f'{document_name}')
-             tarinfo.size = len(content)
-             tar.addfile(tarinfo, io.BytesIO(content))
-
- class Submission:
-     def __init__(self, path=None,sgml_content=None,keep_document_types=None):
-         if path is None and sgml_content is None:
-             raise ValueError("Either path or sgml_content must be provided")
-         if path is not None and sgml_content is not None:
-             raise ValueError("Only one of path or sgml_content must be provided")
+         # Initialize batch tar attributes
+         self.batch_tar_path = batch_tar_path
+         self.accession_prefix = accession_prefix
+         self.portfolio_ref = portfolio_ref

          if sgml_content is not None:
              self.path = None
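
The reworked constructor accepts exactly one of three sources: a path on disk, raw SGML content, or a member of a batch tar. A minimal usage sketch follows; the file paths, accession prefix, and document type are hypothetical, and it assumes (based on the attribute accesses later in this diff) that the object passed as portfolio_ref exposes batch_tar_handles and batch_tar_locks dictionaries keyed by batch tar path:

    from datamule import Portfolio, Submission

    # From a submission directory (or single-submission .tar) on disk
    sub = Submission(path="filings/000123456725000123")

    # From raw SGML bytes already in memory, keeping only selected document types
    raw_sgml = open("filing.sgml", "rb").read()            # hypothetical source file
    sub = Submission(sgml_content=raw_sgml, keep_document_types=["10-K"])

    # From a batch tar (new in this release); portfolio_ref is assumed to hold
    # open tar handles and threading locks for each batch tar it manages
    portfolio = Portfolio("filings")                       # hypothetical portfolio directory
    sub = Submission(
        batch_tar_path="filings/batch_000.tar",            # hypothetical batch tar
        accession_prefix="000123456725000123",             # member prefix inside the tar
        portfolio_ref=portfolio,
    )

    # Supplying zero or more than one of path / sgml_content / batch_tar_path raises ValueError
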
@@ -100,7 +41,7 @@ class Submission:
              filtered_metadata_documents = []

              for idx,doc in enumerate(self.metadata.content['documents']):
-                 type = doc.get('type')()
+                 type = doc.get('type')

                  # Keep only specified types
                  if keep_document_types is not None and type not in keep_document_types:
@@ -115,7 +56,26 @@ class Submission:

              self.metadata.content['documents'] = filtered_metadata_documents

-         if path is not None:
+         elif batch_tar_path is not None:
+             # Batch tar case
+             self.path = None
+
+             # Load metadata from batch tar
+             with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
+                 tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
+                 metadata_obj = tar_handle.extractfile(f'{accession_prefix}/metadata.json')
+                 metadata = json.loads(metadata_obj.read().decode('utf-8'))
+
+             # Set metadata path using :: notation
+             metadata_path = f"{batch_tar_path}::{accession_prefix}/metadata.json"
+
+             # standardize metadata
+             metadata = transform_metadata_string(metadata)
+             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+             self.accession = self.metadata.content['accession-number']
+             self.filing_date = f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+         elif path is not None:
              self.path = Path(path)
              if self.path.suffix == '.tar':
                  with tarfile.open(self.path,'r') as tar:
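
The batch-tar branch above records the metadata location as a pseudo-path of the form "<batch_tar_path>::<accession_prefix>/metadata.json". The helper below is not part of the package; it is only a sketch of how such a "::"-separated path decomposes back into a tar file and a member name:

    def split_batch_path(pseudo_path):
        """Split 'batches/batch_000.tar::0001234.../metadata.json' into (tar_path, member)."""
        tar_path, _, member = pseudo_path.partition("::")
        return tar_path, member

    tar_path, member = split_batch_path("batches/batch_000.tar::000123456725000123/metadata.json")
    # tar_path == "batches/batch_000.tar"
    # member   == "000123456725000123/metadata.json"
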
@@ -134,135 +94,71 @@ class Submission:
              self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
              self.accession = self.metadata.content['accession-number']
              self.filing_date = f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-     def compress(self, compression=None, level=None, threshold=1048576):
-         if self.path is None:
-             raise ValueError("Compress requires path")
-
-         if compression is not None and compression not in ['gzip', 'zstd']:
-             raise ValueError("compression must be 'gzip' or 'zstd'")
-
-         # check if we're loading from a dir or a tar file
-         is_dir_not_tar = True
-         if self.path.suffix == '.tar':
-             is_dir_not_tar = False
-         elif not self.path.is_dir():
-             raise ValueError("Path must be a directory to compress")
-         # Create tar file (replace directory with .tar file)
-         tar_path = self.path.with_suffix('.tar')
-
-         # load all files in the directory or tar file
-         documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
-
-
-         # we should compress everything here first.
-         compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
-         documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
-                      len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
-                      len(doc) >= threshold else doc for doc in documents]
-
-         metadata = self.metadata.content.copy()
-         write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
-
-         # Delete original folder
-         if is_dir_not_tar:
-             shutil.rmtree(self.path)
-         # otherwise, we already replaced the tar file
-         # Update path to point to new tar file
-         self.path = tar_path
-
-     def decompress(self):
-         if self.path is None:
-             raise ValueError("Decompress requires path")
-         elif self.path.suffix != '.tar':
-             raise ValueError("Can only decompress tar")
-
-         # Create output directory (path without .tar extension)
-         output_dir = self.path.with_suffix('')
-         output_dir.mkdir(exist_ok=True)
-
-         with tarfile.open(self.path, 'r') as tar:
-             for member in tar.getmembers():
-                 if member.isfile():
-                     content = tar.extractfile(member).read()
-
-                     # Decompress based on file extension
-                     if member.name.endswith('.gz'):
-                         content = gzip.decompress(content)
-                         output_path = output_dir / member.name[:-3] # Remove .gz extension
-                     elif member.name.endswith('.zst'):
-                         dctx = zstd.ZstdDecompressor()
-                         content = dctx.decompress(content)
-                         output_path = output_dir / member.name[:-4] # Remove .zst extension
-                     else:
-                         output_path = output_dir / member.name
-
-                     # check if it is metadata.json
-                     if output_path.name == 'metadata.json':
-                         # load as json
-                         metadata = json.loads(content.decode('utf-8'))
-                         # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
-                         for doc in metadata['documents']:
-                             if 'secsgml_start_byte' in doc:
-                                 del doc['secsgml_start_byte']
-
-                             if 'secsgml_end_byte' in doc:
-                                 del doc['secsgml_end_byte']
-
-                         with output_path.open('w', encoding='utf-8') as f:
-                             json.dump(metadata, f)
-                     else:
-                         # Write to output directory
-                         output_path.parent.mkdir(parents=True, exist_ok=True)
-                         with output_path.open('wb') as f:
-                             f.write(content)
-
-         # delete original file
-         self.path.unlink()
-         self.path = output_dir

      def _load_document_by_index(self, idx):
          """Load a document by its index in the metadata documents list."""
          doc = self.metadata.content['documents'][idx]

          # If loaded from sgml_content, return pre-loaded document
-         if self.path is None:
+         if self.path is None and self.batch_tar_path is None:
              return self.documents[idx]

-         # If loaded from path, load document on-demand
+         # Get filename from metadata - this is the source of truth
          filename = doc.get('filename')
          if filename is None:
              filename = doc['sequence'] + '.txt'

-         document_path = self.path / filename
-         extension = document_path.suffix
+         # Get the base extension (before any compression extension)
+         # If filename ends with .gz or .zst, the real extension is before that
+         if filename.endswith('.gz'):
+             extension = Path(filename[:-3]).suffix
+             is_compressed = 'gzip'
+         elif filename.endswith('.zst'):
+             extension = Path(filename[:-4]).suffix
+             is_compressed = 'zstd'
+         else:
+             extension = Path(filename).suffix
+             is_compressed = False

-         if self.path.suffix == '.tar':
-             with tarfile.open(self.path, 'r') as tar:
-                 # bandaid fix TODO
-                 try:
-                     content = tar.extractfile(filename).read()
-                 except:
-                     try:
-                         content = tar.extractfile(filename+'.gz').read()
-                     except:
-                         try:
-                             content = tar.extractfile(filename+'.zst').read()
-                         except:
-                             # some of these issues are on SEC data end, will fix when I setup cloud.
-                             raise ValueError(f"Something went wrong with tar: {self.path}")
-                 # Decompress if compressed
-                 if filename.endswith('.gz'):
+         # Handle batch tar case
+         if self.batch_tar_path is not None:
+             with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
+                 tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
+
+                 # Use exact filename from metadata
+                 tar_path = f'{self.accession_prefix}/{filename}'
+                 content = tar_handle.extractfile(tar_path).read()
+
+
+             # Decompress if needed based on filename extension
+             if is_compressed == 'gzip':
                  content = gzip.decompress(content)
-             elif filename.endswith('.zst'):
-                 dctx = zstd.ZstdDecompressor()
-                 content = dctx.decompress(content)
+             elif is_compressed == 'zstd':
+                 content = zstd.ZstdDecompressor().decompress(content)
+
+             # Decode text files
+             if extension in ['.htm', '.html', '.txt', '.xml']:
+                 content = content.decode('utf-8', errors='replace')
+
+             document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+
+         # Handle regular path case
          else:
+             # Use exact filename from metadata
+             document_path = self.path / filename
+
+             if not document_path.exists():
+                 raise FileNotFoundError(f"Document file not found: {document_path}")
+
              with document_path.open('rb') as f:
                  content = f.read()
-
+
+             # Decompress if needed based on filename extension
+             if is_compressed == 'gzip':
+                 content = gzip.decompress(content)
+             elif is_compressed == 'zstd':
+                 content = zstd.ZstdDecompressor().decompress(content)
+
          # Decode text files
          if extension in ['.htm', '.html', '.txt', '.xml']:
              content = content.decode('utf-8', errors='replace')
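
The new _load_document_by_index derives both the compression codec and the real document extension from the filename stored in metadata, instead of probing the tar with nested try/except as the removed code did. Below is a standalone restatement of that suffix-driven logic, for illustration only; the function names are not part of the package:

    import gzip
    from pathlib import Path

    import zstandard as zstd

    def detect_codec(filename):
        """Return (base_extension, codec) implied by the stored filename."""
        if filename.endswith('.gz'):
            return Path(filename[:-3]).suffix, 'gzip'
        if filename.endswith('.zst'):
            return Path(filename[:-4]).suffix, 'zstd'
        return Path(filename).suffix, None

    def decode_document(raw, filename):
        """Decompress and, for text-like extensions, decode a stored document."""
        extension, codec = detect_codec(filename)
        if codec == 'gzip':
            raw = gzip.decompress(raw)
        elif codec == 'zstd':
            raw = zstd.ZstdDecompressor().decompress(raw)
        if extension in ['.htm', '.html', '.txt', '.xml']:
            return raw.decode('utf-8', errors='replace')
        return raw

    # detect_codec('primary_doc.xml.zst') -> ('.xml', 'zstd')
    # detect_codec('ex99-1.htm')          -> ('.htm', None)
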
@@ -41,9 +41,9 @@ def process_file_batch(zip_file, filenames_batch):
      # Create filing records for this file
      for j in range(len(accession_numbers)):
          filing_record = {
-             'accessionNumber': accession_numbers[j],
+             'accessionNumber': int(accession_numbers[j].replace('-','')),
              'filingDate': filing_dates[j],
-             'form': forms[j],
+             'submissionType': forms[j],
              'cik': cik
          }
          batch_filings.append(filing_record)
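
Two schema changes land in these filing records: accession numbers are now stored as dash-stripped integers rather than strings, and the column previously named 'form' becomes 'submissionType' (the CSV header in write_csv_chunk below changes to match). An illustrative example of the normalization, using a made-up accession number:

    accession = "0001234567-25-000123"                        # hypothetical accession number
    filing_record = {
        'accessionNumber': int(accession.replace('-', '')),   # 123456725000123 (leading zeros drop)
        'filingDate': '2025-01-15',
        'submissionType': '10-K',
        'cik': 1234567,
    }
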
@@ -59,13 +59,13 @@ def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
      with write_lock:
          if is_first_write:
              with open(output_path, 'w', newline='') as csvfile:
-                 fieldnames = ['accessionNumber', 'filingDate', 'form', 'cik']
+                 fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
                  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                  writer.writeheader()
                  writer.writerows(filings_data)
          else:
              with open(output_path, 'a', newline='') as csvfile:
-                 fieldnames = ['accessionNumber', 'filingDate', 'form', 'cik']
+                 fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
                  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                  writer.writerows(filings_data)

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.6.1
+ Version: 1.6.3
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -3,9 +3,10 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
  datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
  datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
- datamule/portfolio.py,sha256=eF1eDSwIg-CI8ZmZAHRjCGU0UhuPN4ijxPB0YDT4s2o,8023
+ datamule/portfolio.py,sha256=tADqQMkFaFyjanbJ0QcaOHGdJJB254rOg29FW7a13l0,11835
+ datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
- datamule/submission.py,sha256=vAiYNas1YrWgm4Grw24peJbfSUVERySEko1zmdtG49s,13033
+ datamule/submission.py,sha256=yDPglaFJ65nXn7Lxh-JFTQGKVVmBJDHBVWTf4UEUm2M,8610
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
@@ -61,11 +62,11 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
  datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
  datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
- datamule/seclibrary/downloader.py,sha256=ylv69VF22IVfrdeCkiGr5mVa2GKrPC9zFiDJU1fiBu8,17262
+ datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOaHWXs,17564
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/utils/construct_submissions_data.py,sha256=Jn37Ra2_nCIalATCjP_484eUiFP_YeglX_uNdK4Qfu8,5883
- datamule-1.6.1.dist-info/METADATA,sha256=0SEtRwvbaGgU-x_D8u3n0MUPYLssODtQf4GhQrGfl7s,524
- datamule-1.6.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- datamule-1.6.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
- datamule-1.6.1.dist-info/RECORD,,
+ datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
+ datamule-1.6.3.dist-info/METADATA,sha256=9tb_ecnMVFHYq-Jcj_O0xAYUtM6v2PEZRxdEtPnorD4,524
+ datamule-1.6.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ datamule-1.6.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-1.6.3.dist-info/RECORD,,