datamule 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +102 -18
- datamule/portfolio_compression_utils.py +291 -0
- datamule/seclibrary/downloader.py +163 -161
- datamule/submission.py +82 -186
- datamule/utils/construct_submissions_data.py +4 -4
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/METADATA +1 -1
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/RECORD +9 -8
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/WHEEL +0 -0
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/top_level.txt +0 -0
datamule/submission.py
CHANGED
@@ -2,87 +2,28 @@ from pathlib import Path
 import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
-from secsgml.utils import bytes_to_str
 from secsgml.parse_sgml import transform_metadata_string
 import tarfile
-import shutil
 import zstandard as zstd
 import gzip
-import io
-import copy
 
 
-
-
-
-
-    for file_num in range(len(documents)):
-        if 'documents' in placeholder_metadata:
-            placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999"  # 10 digits
-            placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999"  # 10 digits
-
-    # Step 2: Calculate size with placeholders
-    placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
-    placeholder_json = json.dumps(placeholder_str).encode('utf-8')
-    metadata_size = len(placeholder_json)
-
-    # Step 3: Now calculate actual positions using this size
-    current_pos = 512 + metadata_size
-    current_pos += (512 - (current_pos % 512)) % 512
-
-    # Step 4: Calculate real positions and update original metadata (10-digit padded)
-    for file_num, content in enumerate(documents):
-        start_byte = current_pos + 512
-        end_byte = start_byte + len(content)
+class Submission:
+    def __init__(self, path=None, sgml_content=None, keep_document_types=None,
+                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
 
-
-
-
-
+        # Validate parameters
+        param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
+        if param_count != 1:
+            raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
 
-
-
-        current_pos += padded_size
-
-    return metadata
-
-
-def write_submission_to_tar(output_path, metadata, documents, standardize_metadata, compression_list):
-    # Write tar directly to disk
-    with tarfile.open(output_path, 'w') as tar:
-
-        # calculate document locations in tar
-        metadata = calculate_documents_locations_in_tar(metadata, documents)
+        if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
+            raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")
 
-        #
-
-
-
-        tarinfo = tarfile.TarInfo(name='metadata.json')
-        tarinfo.size = len(metadata_json)
-        tar.addfile(tarinfo, io.BytesIO(metadata_json))
-
-        for file_num, content in enumerate(documents, 0):
-            if standardize_metadata:
-                document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
-
-            compression = compression_list[file_num]
-            if compression == 'gzip':
-                document_name = f'{document_name}.gz'
-            elif compression == 'zstd':
-                document_name = f'{document_name}.zst'
-
-
-            tarinfo = tarfile.TarInfo(name=f'{document_name}')
-            tarinfo.size = len(content)
-            tar.addfile(tarinfo, io.BytesIO(content))
-
-class Submission:
-    def __init__(self, path=None, sgml_content=None, keep_document_types=None):
-        if path is None and sgml_content is None:
-            raise ValueError("Either path or sgml_content must be provided")
-        if path is not None and sgml_content is not None:
-            raise ValueError("Only one of path or sgml_content must be provided")
+        # Initialize batch tar attributes
+        self.batch_tar_path = batch_tar_path
+        self.accession_prefix = accession_prefix
+        self.portfolio_ref = portfolio_ref
 
         if sgml_content is not None:
             self.path = None
@@ -100,7 +41,7 @@ class Submission:
             filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata.content['documents']):
-                type = doc.get('type')
+                type = doc.get('type')
 
                 # Keep only specified types
                 if keep_document_types is not None and type not in keep_document_types:
@@ -115,7 +56,26 @@ class Submission:
 
             self.metadata.content['documents'] = filtered_metadata_documents
 
-
+        elif batch_tar_path is not None:
+            # Batch tar case
+            self.path = None
+
+            # Load metadata from batch tar
+            with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
+                tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
+                metadata_obj = tar_handle.extractfile(f'{accession_prefix}/metadata.json')
+                metadata = json.loads(metadata_obj.read().decode('utf-8'))
+
+            # Set metadata path using :: notation
+            metadata_path = f"{batch_tar_path}::{accession_prefix}/metadata.json"
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+        elif path is not None:
             self.path = Path(path)
             if self.path.suffix == '.tar':
                 with tarfile.open(self.path,'r') as tar:
@@ -134,135 +94,71 @@ class Submission:
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-    def compress(self, compression=None, level=None, threshold=1048576):
-        if self.path is None:
-            raise ValueError("Compress requires path")
-
-        if compression is not None and compression not in ['gzip', 'zstd']:
-            raise ValueError("compression must be 'gzip' or 'zstd'")
-
-        # check if we're loading from a dir or a tar file
-        is_dir_not_tar = True
-        if self.path.suffix == '.tar':
-            is_dir_not_tar = False
-        elif not self.path.is_dir():
-            raise ValueError("Path must be a directory to compress")
-        # Create tar file (replace directory with .tar file)
-        tar_path = self.path.with_suffix('.tar')
-
-        # load all files in the directory or tar file
-        documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
-
-
-        # we should compress everything here first.
-        compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
-        documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
-                     len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
-                     len(doc) >= threshold else doc for doc in documents]
-
-        metadata = self.metadata.content.copy()
-        write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
-
-        # Delete original folder
-        if is_dir_not_tar:
-            shutil.rmtree(self.path)
-        # otherwise, we already replaced the tar file
-        # Update path to point to new tar file
-        self.path = tar_path
-
-    def decompress(self):
-        if self.path is None:
-            raise ValueError("Decompress requires path")
-        elif self.path.suffix != '.tar':
-            raise ValueError("Can only decompress tar")
-
-        # Create output directory (path without .tar extension)
-        output_dir = self.path.with_suffix('')
-        output_dir.mkdir(exist_ok=True)
-
-        with tarfile.open(self.path, 'r') as tar:
-            for member in tar.getmembers():
-                if member.isfile():
-                    content = tar.extractfile(member).read()
-
-                    # Decompress based on file extension
-                    if member.name.endswith('.gz'):
-                        content = gzip.decompress(content)
-                        output_path = output_dir / member.name[:-3]  # Remove .gz extension
-                    elif member.name.endswith('.zst'):
-                        dctx = zstd.ZstdDecompressor()
-                        content = dctx.decompress(content)
-                        output_path = output_dir / member.name[:-4]  # Remove .zst extension
-                    else:
-                        output_path = output_dir / member.name
-
-                    # check if it is metadata.json
-                    if output_path.name == 'metadata.json':
-                        # load as json
-                        metadata = json.loads(content.decode('utf-8'))
-                        # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
-                        for doc in metadata['documents']:
-                            if 'secsgml_start_byte' in doc:
-                                del doc['secsgml_start_byte']
-
-                            if 'secsgml_end_byte' in doc:
-                                del doc['secsgml_end_byte']
-
-                        with output_path.open('w', encoding='utf-8') as f:
-                            json.dump(metadata, f)
-                    else:
-                        # Write to output directory
-                        output_path.parent.mkdir(parents=True, exist_ok=True)
-                        with output_path.open('wb') as f:
-                            f.write(content)
-
-        # delete original file
-        self.path.unlink()
-        self.path = output_dir
 
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
         doc = self.metadata.content['documents'][idx]
 
         # If loaded from sgml_content, return pre-loaded document
-        if self.path is None:
+        if self.path is None and self.batch_tar_path is None:
            return self.documents[idx]
 
-        #
+        # Get filename from metadata - this is the source of truth
        filename = doc.get('filename')
        if filename is None:
            filename = doc['sequence'] + '.txt'
 
-
-        extension
+        # Get the base extension (before any compression extension)
+        # If filename ends with .gz or .zst, the real extension is before that
+        if filename.endswith('.gz'):
+            extension = Path(filename[:-3]).suffix
+            is_compressed = 'gzip'
+        elif filename.endswith('.zst'):
+            extension = Path(filename[:-4]).suffix
+            is_compressed = 'zstd'
+        else:
+            extension = Path(filename).suffix
+            is_compressed = False
 
-
-
-
-
-
-
-
-
-
-
-
-        # some of these issues are on SEC data end, will fix when I setup cloud.
-        raise ValueError(f"Something went wrong with tar: {self.path}")
-        # Decompress if compressed
-        if filename.endswith('.gz'):
+        # Handle batch tar case
+        if self.batch_tar_path is not None:
+            with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
+                tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
+
+                # Use exact filename from metadata
+                tar_path = f'{self.accession_prefix}/{filename}'
+                content = tar_handle.extractfile(tar_path).read()
+
+
+            # Decompress if needed based on filename extension
+            if is_compressed == 'gzip':
                 content = gzip.decompress(content)
-            elif
-
-
+            elif is_compressed == 'zstd':
+                content = zstd.ZstdDecompressor().decompress(content)
+
+            # Decode text files
+            if extension in ['.htm', '.html', '.txt', '.xml']:
+                content = content.decode('utf-8', errors='replace')
+
+            document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+
+        # Handle regular path case
         else:
+            # Use exact filename from metadata
+            document_path = self.path / filename
+
+            if not document_path.exists():
+                raise FileNotFoundError(f"Document file not found: {document_path}")
+
             with document_path.open('rb') as f:
                 content = f.read()
-
+
+            # Decompress if needed based on filename extension
+            if is_compressed == 'gzip':
+                content = gzip.decompress(content)
+            elif is_compressed == 'zstd':
+                content = zstd.ZstdDecompressor().decompress(content)
+
             # Decode text files
             if extension in ['.htm', '.html', '.txt', '.xml']:
                 content = content.decode('utf-8', errors='replace')
datamule/utils/construct_submissions_data.py
CHANGED
@@ -41,9 +41,9 @@ def process_file_batch(zip_file, filenames_batch):
         # Create filing records for this file
         for j in range(len(accession_numbers)):
             filing_record = {
-                'accessionNumber': accession_numbers[j],
+                'accessionNumber': int(accession_numbers[j].replace('-','')),
                 'filingDate': filing_dates[j],
-                '
+                'submissionType': forms[j],
                 'cik': cik
             }
             batch_filings.append(filing_record)
@@ -59,13 +59,13 @@ def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
     with write_lock:
         if is_first_write:
             with open(output_path, 'w', newline='') as csvfile:
-                fieldnames = ['accessionNumber', 'filingDate', '
+                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                 writer.writeheader()
                 writer.writerows(filings_data)
         else:
             with open(output_path, 'a', newline='') as csvfile:
-                fieldnames = ['accessionNumber', 'filingDate', '
+                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                 writer.writerows(filings_data)
 
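
write_csv_chunk keeps the same shape in both branches: the first writer opens in 'w' mode and emits the header, later writers append rows only, and a shared lock serializes the threads feeding the file. A condensed sketch of the pattern (hypothetical names, same fieldnames as the diff):

import csv, threading

FIELDNAMES = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
write_lock = threading.Lock()

def write_chunk(output_path, rows, is_first_write):
    with write_lock:
        mode = 'w' if is_first_write else 'a'
        with open(output_path, mode, newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
            if is_first_write:
                writer.writeheader()  # header exactly once, on the first write
            writer.writerows(rows)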
{datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/RECORD
CHANGED
@@ -3,9 +3,10 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=
+datamule/portfolio.py,sha256=tADqQMkFaFyjanbJ0QcaOHGdJJB254rOg29FW7a13l0,11835
+datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=yDPglaFJ65nXn7Lxh-JFTQGKVVmBJDHBVWTf4UEUm2M,8610
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
@@ -61,11 +62,11 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOaHWXs,17564
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/utils/construct_submissions_data.py,sha256=
-datamule-1.6.
-datamule-1.6.
-datamule-1.6.
-datamule-1.6.
+datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
+datamule-1.6.3.dist-info/METADATA,sha256=9tb_ecnMVFHYq-Jcj_O0xAYUtM6v2PEZRxdEtPnorD4,524
+datamule-1.6.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.6.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.6.3.dist-info/RECORD,,
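
Each RECORD entry has the form "path,sha256=<hash>,size", where the hash is the urlsafe-base64 SHA-256 digest of the file with trailing '=' padding stripped, per the wheel spec. A sketch of how an entry can be recomputed when verifying an unpacked wheel (hypothetical helper name):

import base64, hashlib

def record_entry(path, data):
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=')
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# e.g. record_entry('datamule/submission.py', open('datamule/submission.py','rb').read())
# should reproduce the submission.py line above, ending in ,8610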
{datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/WHEEL
File without changes
{datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/top_level.txt
File without changes