datamule 1.7.1__py3-none-any.whl → 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/seclibrary/downloader.py +35 -31
- datamule/submission.py +17 -3
- {datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/METADATA +1 -1
- {datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/RECORD +6 -6
- {datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/WHEEL +0 -0
- {datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/top_level.txt +0 -0
datamule/seclibrary/downloader.py CHANGED

@@ -9,20 +9,24 @@ import zstandard as zstd
 import io
 import json
 import tarfile
+import logging
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from queue import Queue
+from queue import Queue
 from threading import Thread, Lock
-from .query import query
 from os import cpu_count
 from secsgml import parse_sgml_content_into_memory
 from secsgml.utils import bytes_to_str
+from .datamule_lookup import datamule_lookup
 
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
 
 class Downloader:
     def __init__(self, api_key=None):
-        self.BASE_URL = "https://library.datamule.xyz/
+        self.BASE_URL = "https://sec-library.datamule.xyz/"
         self.CHUNK_SIZE = 2 * 1024 * 1024
         self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
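
The module-level setup above configures the root logger at INFO, so the many logger.debug(...) calls introduced in this version are silent by default; only warnings and errors surface. A minimal, self-contained sketch of the same pattern (the messages are illustrative, not from the package):

    import logging

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    logger.debug("suppressed: below the INFO threshold")
    logger.info("emitted")
    logger.error("emitted")

A consumer who wants the new debug output can opt in with logging.getLogger('datamule.seclibrary.downloader').setLevel(logging.DEBUG), assuming the module path matches the wheel layout shown in the RECORD below.
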
@@ -66,7 +70,7 @@ class Downloader:
             with open(error_file, 'w') as f:
                 json.dump(errors, f, indent=2)
         except Exception as e:
-
+            logger.error(f"Failed to log error to {error_file}: {str(e)}")
 
     class TarManager:
         def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
@@ -81,7 +85,7 @@ class Downloader:
 
             for i in range(num_tar_files):
                 tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
-                self.tar_files[i] = tarfile.open(tar_path, '
+                self.tar_files[i] = tarfile.open(tar_path, 'a')
                 self.tar_locks[i] = Lock()
                 self.file_counters[i] = 0
                 self.tar_sizes[i] = 0
@@ -105,7 +109,7 @@
 
             self.tar_sequences[tar_index] += 1
             new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
-            self.tar_files[tar_index] = tarfile.open(new_tar_path, '
+            self.tar_files[tar_index] = tarfile.open(new_tar_path, 'a')
             self.file_counters[tar_index] = 0
             self.tar_sizes[tar_index] = 0
 
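
Both tar-opening call sites now use mode 'a' (the mode they replace is truncated in this diff). With 'a', tarfile appends members to an existing archive and creates the file if it is missing, so reopening a batch archive does not discard earlier members. A small sketch of that behavior, using hypothetical file names:

    import io
    import tarfile

    path = 'batch_000_001.tar'  # hypothetical archive, mirroring the naming above
    for name, payload in [('first.txt', b'hello'), ('second.txt', b'world')]:
        with tarfile.open(path, 'a') as tar:  # append; creates the file on first use
            info = tarfile.TarInfo(name=name)
            info.size = len(payload)
            tar.addfile(info, io.BytesIO(payload))

    with tarfile.open(path, 'r') as tar:
        print(tar.getnames())  # ['first.txt', 'second.txt']
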
@@ -127,7 +131,7 @@
                 return True
 
             except Exception as e:
-
+                logger.error(f"Error writing {filename} to tar {tar_index}: {str(e)}")
                 return False
 
         def _get_document_name(self, metadata, file_num, standardize_metadata):
@@ -153,7 +157,7 @@
             try:
                 tar.close()
             except Exception as e:
-
+                logger.error(f"Error closing tar {i}: {str(e)}")
 
     def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
         dctx = zstd.ZstdDecompressor()
@@ -221,17 +225,21 @@
                     }
 
                     async with session.get(url, headers=headers) as response:
+                        content_type = response.headers.get('Content-Type', '')
+
                         if response.status == 200:
                             async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
                                 chunks.append(chunk)
 
                             loop = asyncio.get_running_loop()
-                            if
+                            if content_type == 'application/zstd':
+                                logger.debug(f"Processing {filename} as compressed (zstd)")
                                 success = await loop.run_in_executor(
                                     decompression_pool,
                                     partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
                                 )
                             else:
+                                logger.debug(f"Processing {filename} as uncompressed")
                                 success = await loop.run_in_executor(
                                     decompression_pool,
                                     partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
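
The compressed/uncompressed decision now comes from the response's Content-Type header ('application/zstd') rather than per-filing metadata, so the server controls the encoding. A minimal sketch of the decompression side, assuming the joined chunks form one complete zstd frame (decompress_chunks is an illustrative name, not a package function):

    import io
    import zstandard as zstd

    def decompress_chunks(chunks):
        # Reassemble the HTTP chunks, then stream-decompress so the
        # decompressed size does not need to be known up front.
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(io.BytesIO(b''.join(chunks))) as reader:
            return reader.read()

    frame = zstd.ZstdCompressor().compress(b'<SEC-DOCUMENT>...</SEC-DOCUMENT>')
    assert decompress_chunks([frame[:10], frame[10:]]) == b'<SEC-DOCUMENT>...</SEC-DOCUMENT>'
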
@@ -293,32 +301,27 @@
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
 
-
-
-
-
-            filing_date=filing_date,
-            api_key=self.api_key
-        )
+        logger.debug("Querying SEC filings...")
+
+        filings = datamule_lookup(cik=cik, submission_type=submission_type, filing_date=filing_date,
+                                  columns=['accessionNumber'], distinct=True, page_size=25000, quiet=False)
 
         if accession_numbers:
             accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
-            filings = [filing for filing in filings if filing['
+            filings = [filing for filing in filings if filing['accessionNumber'] in accession_numbers]
 
         if skip_accession_numbers:
             skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
-            filings = [filing for filing in filings if filing['
+            filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]
 
-
+        logger.debug(f"Generating URLs for {len(filings)} filings...")
         urls = []
         for item in filings:
-            url = f"{self.BASE_URL}{str(item['
-            if item['compressed'] == True or item['compressed'] == 'true' or item['compressed'] == 'True':
-                url += '.zst'
+            url = f"{self.BASE_URL}{str(item['accessionNumber']).zfill(18)}.sgml"
             urls.append(url)
 
         if not urls:
-
+            logger.warning("No submissions found matching the criteria")
             return
 
         urls = list(set(urls))
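
With the old compressed flag (and its '.zst' suffix) gone, every filing maps to one canonical URL: strip the dashes from the accession number, normalize through int to drop leading zeros, zero-pad back to 18 digits, and append '.sgml'. A worked example of that construction (accession_to_url is a hypothetical helper):

    def accession_to_url(accession, base_url='https://sec-library.datamule.xyz/'):
        # De-dash, normalize via int, left-pad to 18 digits, add the .sgml suffix
        normalized = str(int(str(accession).replace('-', '')))
        return f'{base_url}{normalized.zfill(18)}.sgml'

    print(accession_to_url('0001234567-25-000123'))
    # https://sec-library.datamule.xyz/000123456725000123.sgml
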
@@ -328,8 +331,8 @@
         asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))
 
         elapsed_time = time.time() - start_time
-
-
+        logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+        logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
     def __del__(self):
         if hasattr(self, 'loop') and self.loop.is_running():
@@ -348,10 +351,10 @@
         for filename in filenames:
             if not isinstance(filename, str):
                 raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
-            if not
-                raise ValueError(f"Invalid filename format: {filename}. Expected .sgml
+            if not filename.endswith('.sgml'):
+                raise ValueError(f"Invalid filename format: {filename}. Expected .sgml extension.")
 
-
+        logger.debug(f"Generating URLs for {len(filenames)} files...")
         urls = []
         for filename in filenames:
             url = f"{self.BASE_URL}{filename}"
@@ -360,7 +363,7 @@
         seen = set()
         urls = [url for url in urls if not (url in seen or seen.add(url))]
 
-
+        logger.debug(f"Downloading {len(urls)} files...")
 
         start_time = time.time()
 
@@ -374,12 +377,13 @@
         ))
 
         elapsed_time = time.time() - start_time
-
-
+        logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+        logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
 
 def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
              skip_accession_numbers=[], max_batch_size=1024*1024*1024):
+
     if accession_numbers:
         accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
     elif accession_numbers == []:

datamule/submission.py CHANGED

@@ -3,17 +3,21 @@ import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
 from secsgml.parse_sgml import transform_metadata_string
+from secsgml.utils import bytes_to_str
+from .sec.utils import headers
 import tarfile
 import zstandard as zstd
 import gzip
+import urllib.request
+
 
 
 class Submission:
     def __init__(self, path=None, sgml_content=None, keep_document_types=None,
-                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
+                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None,url=None):
 
         # Validate parameters
-        param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
+        param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path,url])
         if param_count != 1:
             raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
 
@@ -25,9 +29,19 @@
         self.accession_prefix = accession_prefix
         self.portfolio_ref = portfolio_ref
 
-        if sgml_content is not None:
+        if url is not None or sgml_content is not None:
+            if url is not None:
+                request = urllib.request.Request(url, headers=headers)
+                response = urllib.request.urlopen(request)
+
+                if response.getcode() == 200:
+                    sgml_content=response.read()
+                else:
+                    raise ValueError(f"URL: {url}, Error: {response.getcode()}")
+
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+            metadata = bytes_to_str(metadata)
 
             # standardize metadata
             metadata = transform_metadata_string(metadata)
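
Taken together, these changes let a Submission be constructed directly from a library URL, with the SGML fetched internally via urllib and the response bytes handed to the existing sgml_content path. A hedged usage sketch (it assumes Submission is importable from the package root, as in earlier releases, and uses a placeholder URL in the new 18-digit .sgml scheme):

    from datamule import Submission

    # Placeholder URL; a real one follows the accession-number scheme above
    sub = Submission(url='https://sec-library.datamule.xyz/000123456725000123.sgml')

    # The instance then behaves like one built from sgml_content: path is None
    assert sub.path is None

Note that the validation error message above still names only path, sgml_content, and batch_tar_path, even though url now counts toward the same exactly-one check.
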

{datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/RECORD CHANGED

@@ -6,7 +6,7 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
 datamule/portfolio.py,sha256=tADqQMkFaFyjanbJ0QcaOHGdJJB254rOg29FW7a13l0,11835
 datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
 datamule/sheet.py,sha256=V5iR9_LkuwTFxfHCfzgadO6qgB6qOhzWiCAED-y8ZJQ,22744
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=IHfEvHcLj9mrJGCNaJSMRqP9kHuJerGGM9IrN5mLDtM,10865
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/sec_connector.py,sha256=qCDsOgSFtfp-uz-APJjX4YrRoIGnnX-xHCL_JjLmRxk,2387
@@ -60,12 +60,12 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
 datamule/seclibrary/datamule_lookup.py,sha256=_opEh-DRY3ZBXFbuE2Ua_aRwoc1IsV-cPSWK0c61ofY,9465
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=6cPPddjXekOwlzsyratUqzpCSbvdaNyRCGjQXUtVoJU,17930
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-1.
-datamule-1.
-datamule-1.
-datamule-1.
+datamule-1.8.1.dist-info/METADATA,sha256=EANuFHyM9j25cEgk3wWP9eY1Pgb8hT2xV_g0010zpAA,524
+datamule-1.8.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.8.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.8.1.dist-info/RECORD,,

{datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/WHEEL: file without changes
{datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/top_level.txt: file without changes