datamule 2.4.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/book/book.py +6 -1
- datamule/datamule/tar_downloader.py +23 -78
- datamule/submission/submission.py +5 -3
- {datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/METADATA +1 -1
- {datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/RECORD +7 -7
- {datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/WHEEL +0 -0
- {datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/top_level.txt +0 -0
datamule/book/book.py
CHANGED

@@ -10,4 +10,9 @@ class Book:
         s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials, max_workers=max_workers,
                     errors_json_filename=errors_json_filename, retry_errors=retry_errors,
                     force_daily=force_daily, cik=cik, submission_type=submission_type,
-                    filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
+                    filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
+
+
+def download_filings_processed_r2():
+    pass
+
datamule/datamule/tar_downloader.py
CHANGED

@@ -11,8 +11,7 @@ import tarfile
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from queue import Queue
-from threading import Thread, Lock
+from threading import Lock
 from os import cpu_count
 from .datamule_lookup import datamule_lookup
 from ..utils.format_accession import format_accession
@@ -37,19 +36,7 @@ class TarDownloader:
         self.RANGE_MERGE_THRESHOLD = 1024 # Merge ranges if gap <= 1024 bytes
         if api_key is not None:
             self._api_key = api_key
-        self.loop = asyncio.new_event_loop()
-        self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
-        self.loop_thread.start()
-        self.async_queue = Queue()
         self.error_log_lock = Lock()
-
-    def _run_event_loop(self):
-        asyncio.set_event_loop(self.loop)
-        self.loop.run_forever()
-
-    def _run_coroutine(self, coro):
-        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
-        return future.result()

     @property
     def api_key(self):
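For reference, the deleted code implemented the common "background event loop" pattern: a dedicated asyncio loop running in a daemon thread so that synchronous callers can submit coroutines to it. A standalone sketch of that pattern (illustrative only, not part of datamule's API):

import asyncio
from threading import Thread

class LoopRunner:
    def __init__(self):
        self.loop = asyncio.new_event_loop()
        # Daemon thread keeps the loop alive without blocking interpreter exit
        self.thread = Thread(target=self._run, daemon=True)
        self.thread.start()

    def _run(self):
        asyncio.set_event_loop(self.loop)
        self.loop.run_forever()

    def run_coroutine(self, coro):
        # Submit from synchronous code; block until the coroutine finishes
        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
        return future.result()

runner = LoopRunner()
print(runner.run_coroutine(asyncio.sleep(0.1, result="done")))  # prints "done"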
@@ -286,6 +273,11 @@ class TarDownloader:
             filtered.append(doc)

         return filtered
+
+    def _decompress_zstd(self, compressed_content):
+        """Decompress zstd content"""
+        dctx = zstd.ZstdDecompressor()
+        return dctx.decompress(compressed_content)

 class TarManager:
     def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
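The new `_decompress_zstd` helper uses python-zstandard's one-shot API, which relies on the frame header carrying the decompressed size. A minimal round-trip sketch of the case it handles (standalone, using the zstandard package):

import zstandard as zstd

payload = b"hello world" * 100
# One-shot compress() embeds the content size in the frame header by default,
# which is exactly what one-shot decompress() requires.
compressed = zstd.ZstdCompressor().compress(payload)

dctx = zstd.ZstdDecompressor()
assert dctx.decompress(compressed) == payload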
@@ -357,6 +349,8 @@ class TarDownloader:
     def _parse_multipart_byteranges(self, content, content_type):
         """
         Parse multipart/byteranges response.
+        Currently simplified for single-range responses.
+        Future: implement full multipart parsing when using database with multiple ranges.

         Args:
             content: Response body bytes
@@ -365,49 +359,12 @@ class TarDownloader:
         Returns:
             list of (start_byte, end_byte, data) tuples
         """
-        #
+        # For now, handle single range responses only
         if 'boundary=' not in content_type:
-            # Single range response, not multipart
             return [(None, None, content)]

-        boundary = content_type.split('boundary=')[1].strip()
-        boundary_bytes = f'--{boundary}'.encode('utf-8')
-        end_boundary_bytes = f'--{boundary}--'.encode('utf-8')
-
-        parts = []
-        sections = content.split(boundary_bytes)
-
-        for section in sections[1:]: # Skip first empty section
-            if section.startswith(end_boundary_bytes) or not section.strip():
-                continue
-
-            # Split headers from body
-            header_end = section.find(b'\r\n\r\n')
-            if header_end == -1:
-                header_end = section.find(b'\n\n')
-                if header_end == -1:
-                    continue
-                body_start = header_end + 2
-            else:
-                body_start = header_end + 4
-
-            headers = section[:header_end].decode('utf-8', errors='ignore')
-            body = section[body_start:].rstrip(b'\r\n')
-
-            # Parse Content-Range header
-            start_byte = None
-            end_byte = None
-            for line in headers.split('\n'):
-                if line.lower().startswith('content-range:'):
-                    # Format: "Content-Range: bytes START-END/TOTAL"
-                    range_part = line.split(':')[1].strip()
-                    if 'bytes ' in range_part:
-                        byte_range = range_part.split('bytes ')[1].split('/')[0]
-                        start_byte, end_byte = map(int, byte_range.split('-'))
-
-            parts.append((start_byte, end_byte, body))
-
-        return parts
+        # TODO: Implement full multipart parsing when database returns multiple discontinuous ranges
+        return [(None, None, content)]

     def extract_and_process_tar(self, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial=False):
         """Extract tar file and process its contents"""
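Single-range responses carry their offsets in a Content-Range header rather than a multipart body, which is why the simplified method can return (None, None, content) and let callers use the whole payload. For reference, extracting offsets from such a header follows plain HTTP semantics:

# Sketch: parsing a standard Content-Range value such as "bytes 0-1023/146515"
def parse_content_range(header_value):
    byte_range = header_value.split("bytes ")[1].split("/")[0]
    start, end = map(int, byte_range.split("-"))
    return start, end

assert parse_content_range("bytes 0-1023/146515") == (0, 1023)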
@@ -422,9 +379,14 @@ class TarDownloader:
             self._log_error(output_dir, filename, "No files found in partial tar")
             return False

-        # First file
+        # First file is metadata (never compressed)
         metadata_content = files[0]['content']
-
+
+        # Remaining files are documents (always compressed)
+        documents = []
+        for file in files[1:]:
+            file['content'] = self._decompress_zstd(file['content'])
+            documents.append(file)

         # Build filename to type mapping from metadata
         filename_map = self._build_filename_to_type_map(metadata_content)
@@ -452,17 +414,14 @@ class TarDownloader:
             file_content = tar.extractfile(member).read()

             if idx == 0:
-                # First file is metadata
+                # First file is metadata (never compressed)
                 metadata_content = file_content
             else:
-
-
-                # Check if file is zstd compressed
-                if self._is_zstd_compressed(file_content):
-                    file_content = self._decompress_zstd(file_content)
+                # All other files are documents (always compressed)
+                file_content = self._decompress_zstd(file_content)

                 documents.append({
-                    'name':
+                    'name': os.path.basename(member.name),
                     'content': file_content
                 })

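Note that `os.path.basename(member.name)` keeps only the leaf name of each tar member, so any directory prefix inside the archive is dropped (and same-named files in different directories would collide). A one-line illustration with hypothetical member paths:

import os

# Hypothetical tar member paths; only the leaf names survive
members = ["0001234567-25-000001/primary.htm", "0001234567-25-000001/ex99.htm"]
print([os.path.basename(m) for m in members])  # ['primary.htm', 'ex99.htm']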
@@ -488,15 +447,6 @@ class TarDownloader:
         except Exception as e:
             self._log_error(output_dir, filename, f"Tar extraction error: {str(e)}")
             return False
-
-    def _is_zstd_compressed(self, content):
-        """Check if content is zstd compressed by magic number"""
-        return len(content) >= 4 and content[:4] == b'\x28\xb5\x2f\xfd'
-
-    def _decompress_zstd(self, compressed_content):
-        """Decompress zstd content"""
-        dctx = zstd.ZstdDecompressor()
-        return dctx.decompress(compressed_content)

     async def download_and_process(self, session, url, semaphore, extraction_pool, tar_manager, output_dir, pbar, keep_document_types, range_lookup_db=None):
         async with semaphore:
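The removed `_is_zstd_compressed` helper sniffed the standard zstd frame magic number (0xFD2FB528, stored little-endian on the wire as 28 B5 2F FD); it is no longer needed now that documents are always compressed. The check it performed, as a standalone sketch:

import zstandard as zstd

frame = zstd.ZstdCompressor().compress(b"data")
# Every zstd frame begins with the magic number 0xFD2FB528 (little-endian)
assert frame[:4] == b"\x28\xb5\x2f\xfd"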
@@ -654,10 +604,6 @@ class TarDownloader:
         elapsed_time = time.time() - start_time
         logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
         logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
-
-    def __del__(self):
-        if hasattr(self, 'loop') and self.loop.is_running():
-            self.loop.call_soon_threadsafe(self.loop.stop)

     def download_files_using_filename(self, filenames, output_dir="downloads", max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
         if self.api_key is None:
@@ -681,8 +627,7 @@ class TarDownloader:
             url = f"{self.BASE_URL}{filename}"
             urls.append(url)

-
-        urls = [url for url in urls if not (url in seen or seen.add(url))]
+        urls = list(set(urls))

         logger.debug(f"Downloading {len(urls)} tar files...")

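One behavioral nuance of the new dedup: `list(set(urls))` does not preserve insertion order, while the removed seen-set comprehension did. If order ever matters, an order-preserving equivalent is a one-liner:

# Order-preserving dedup (equivalent to the removed seen-set idiom)
urls = ["a", "b", "a", "c"]
deduped = list(dict.fromkeys(urls))  # keeps first occurrence of each item
assert deduped == ["a", "b", "c"]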
datamule/submission/submission.py
CHANGED

@@ -110,7 +110,7 @@ class Submission:
         content_type = response.headers.get('Content-Type', '')
         if content_type == 'application/zstd':
             dctx = zstd.ZstdDecompressor()
-            sgml_content = dctx.decompress(sgml_content)
+            sgml_content = dctx.decompressobj().decompress(sgml_content)
         else:
             raise ValueError(f"URL: {url}, Error: {response.getcode()}")

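The one-line change above swaps the one-shot `decompress()` for `decompressobj().decompress()`. In python-zstandard, the one-shot call fails on frames whose header does not record the decompressed size (typical of streamed compression), while the streaming object needs no size up front. A sketch of the failure mode the change avoids, assuming the server emits such frames:

import zstandard as zstd

payload = b"example sgml content"
# compressobj() streams, so the frame header carries no content size
cobj = zstd.ZstdCompressor().compressobj()
frame = cobj.compress(payload) + cobj.flush()

dctx = zstd.ZstdDecompressor()
assert dctx.decompressobj().decompress(frame) == payload
# dctx.decompress(frame) would raise ZstdError: could not determine content size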
@@ -122,7 +122,6 @@ class Submission:
         metadata = transform_metadata_string(metadata)

         self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-
         self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"

         self.documents_obj_list = []
@@ -341,6 +340,8 @@ class Submission:
             context = xbrl_record['_context']
             period_start_date = context.get('period_instant') or context.get('period_startdate')
             period_end_date = context.get('period_enddate')
+        else:
+            context = None

         # Create record in the format expected by construct_fundamentals
         record = {
@@ -348,7 +349,8 @@ class Submission:
             'name': name,
             'value': value,
             'period_start_date': period_start_date,
-            'period_end_date': period_end_date
+            'period_end_date': period_end_date,
+            'context' : context
         }

         xbrl.append(record)
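The first change above binds `context` to None when a record carries no `_context`, so the record dict can always include it; the second threads `context` into the record. A minimal sketch of the guard (the surrounding condition is not shown in the diff, so the triggering record here is hypothetical):

xbrl_record = {"name": "Revenues", "value": "100"}  # hypothetical: no '_context' key

if "_context" in xbrl_record:
    context = xbrl_record["_context"]
else:
    context = None  # mirrors the added else-branch

record = {"name": xbrl_record["name"], "value": xbrl_record["value"], "context": context}
assert record["context"] is None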
{datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/RECORD
CHANGED

@@ -5,7 +5,7 @@ datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
 datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/book/book.py,sha256=
+datamule/book/book.py,sha256=AwQUKpd3iAUbUGs2SzODIiK7aBrG2YdqwjqMp8-Fvtg,839
 datamule/book/s3transfer.py,sha256=arftLhYThLSGvmBSNnU2rNpkqiyvwAL32OVAKP4HOAQ,12596
 datamule/cloud/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
@@ -14,7 +14,7 @@ datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5
 datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
 datamule/datamule/downloader.py,sha256=v0cG8eHZs9fttM55_ymHUWtPnCsK1aGiFTuM3jmLiCY,18650
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
-datamule/datamule/tar_downloader.py,sha256=
+datamule/datamule/tar_downloader.py,sha256=w_HePdFJ-SjiFNLpQrFW-zn0qYjABZNRZSCO118FIgM,27326
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -46,7 +46,7 @@ datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,180
 datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
 datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/submission/submission.py,sha256=
+datamule/submission/submission.py,sha256=cd1SKi3fzNmvXmgbtxA7j2zc2KnFE2f68Qbta9Bnlu8,17629
 datamule/submission/tar_submission.py,sha256=uJHyTY5G8OVqmXzb0zaBEsLNthppGqYXbW-xFM4XMok,2901
 datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
@@ -68,7 +68,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
 datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
-datamule-2.4.0.dist-info/METADATA,sha256=
-datamule-2.4.0.dist-info/WHEEL,sha256=
-datamule-2.4.0.dist-info/top_level.txt,sha256=
-datamule-2.4.0.dist-info/RECORD,,
+datamule-2.4.1.dist-info/METADATA,sha256=61-fgRGZb-L2yINFRhsGU_ITyPxrh7RmwC_VCVIITE4,609
+datamule-2.4.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.4.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.4.1.dist-info/RECORD,,
{datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/WHEEL
File without changes

{datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/top_level.txt
File without changes