datamule 2.4.0__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamule might be problematic.

datamule/book/book.py CHANGED
@@ -10,4 +10,9 @@ class Book:
          s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials, max_workers=max_workers,
                      errors_json_filename=errors_json_filename, retry_errors=retry_errors,
                      force_daily=force_daily, cik=cik, submission_type=submission_type,
-                     filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
+                     filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
+
+
+     def download_filings_processed_r2():
+         pass
+

datamule/datamule/tar_downloader.py CHANGED

@@ -11,8 +11,7 @@ import tarfile
  import logging
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
- from queue import Queue
- from threading import Thread, Lock
+ from threading import Lock
  from os import cpu_count
  from .datamule_lookup import datamule_lookup
  from ..utils.format_accession import format_accession
@@ -37,19 +36,7 @@ class TarDownloader:
          self.RANGE_MERGE_THRESHOLD = 1024 # Merge ranges if gap <= 1024 bytes
          if api_key is not None:
              self._api_key = api_key
-         self.loop = asyncio.new_event_loop()
-         self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
-         self.loop_thread.start()
-         self.async_queue = Queue()
          self.error_log_lock = Lock()
-
-     def _run_event_loop(self):
-         asyncio.set_event_loop(self.loop)
-         self.loop.run_forever()
-
-     def _run_coroutine(self, coro):
-         future = asyncio.run_coroutine_threadsafe(coro, self.loop)
-         return future.result()

      @property
      def api_key(self):
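
The removed constructor wiring and helpers implemented a common bridge pattern: a daemon thread hosts a dedicated asyncio event loop, and synchronous code submits coroutines to it (the matching __del__ teardown is removed in a later hunk). A minimal standalone sketch of that pattern, for illustration only — this is not datamule code:

    import asyncio
    from threading import Thread

    # A daemon thread hosts the event loop...
    loop = asyncio.new_event_loop()
    Thread(target=loop.run_forever, daemon=True).start()

    async def work():
        await asyncio.sleep(0.1)
        return "done"

    # ...and run_coroutine_threadsafe bridges synchronous code into it,
    # returning a concurrent.futures.Future that can be blocked on.
    future = asyncio.run_coroutine_threadsafe(work(), loop)
    print(future.result())                # "done"
    loop.call_soon_threadsafe(loop.stop)  # the teardown __del__ performed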
@@ -286,6 +273,11 @@ class TarDownloader:
              filtered.append(doc)

          return filtered
+
+     def _decompress_zstd(self, compressed_content):
+         """Decompress zstd content"""
+         dctx = zstd.ZstdDecompressor()
+         return dctx.decompress(compressed_content)


  class TarManager:
      def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
@@ -357,6 +349,8 @@ class TarDownloader:
      def _parse_multipart_byteranges(self, content, content_type):
          """
          Parse multipart/byteranges response.
+         Currently simplified for single-range responses.
+         Future: implement full multipart parsing when using database with multiple ranges.

          Args:
              content: Response body bytes
@@ -365,49 +359,12 @@ class TarDownloader:
          Returns:
              list of (start_byte, end_byte, data) tuples
          """
-         # Extract boundary from content type
+         # For now, handle single range responses only
          if 'boundary=' not in content_type:
-             # Single range response, not multipart
              return [(None, None, content)]

-         boundary = content_type.split('boundary=')[1].strip()
-         boundary_bytes = f'--{boundary}'.encode('utf-8')
-         end_boundary_bytes = f'--{boundary}--'.encode('utf-8')
-
-         parts = []
-         sections = content.split(boundary_bytes)
-
-         for section in sections[1:]:  # Skip first empty section
-             if section.startswith(end_boundary_bytes) or not section.strip():
-                 continue
-
-             # Split headers from body
-             header_end = section.find(b'\r\n\r\n')
-             if header_end == -1:
-                 header_end = section.find(b'\n\n')
-                 if header_end == -1:
-                     continue
-                 body_start = header_end + 2
-             else:
-                 body_start = header_end + 4
-
-             headers = section[:header_end].decode('utf-8', errors='ignore')
-             body = section[body_start:].rstrip(b'\r\n')
-
-             # Parse Content-Range header
-             start_byte = None
-             end_byte = None
-             for line in headers.split('\n'):
-                 if line.lower().startswith('content-range:'):
-                     # Format: "Content-Range: bytes START-END/TOTAL"
-                     range_part = line.split(':')[1].strip()
-                     if 'bytes ' in range_part:
-                         byte_range = range_part.split('bytes ')[1].split('/')[0]
-                         start_byte, end_byte = map(int, byte_range.split('-'))
-
-             parts.append((start_byte, end_byte, body))
-
-         return parts
+         # TODO: Implement full multipart parsing when database returns multiple discontinuous ranges
+         return [(None, None, content)]

      def extract_and_process_tar(self, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial=False):
          """Extract tar file and process its contents"""
@@ -422,9 +379,14 @@ class TarDownloader:
              self._log_error(output_dir, filename, "No files found in partial tar")
              return False

-         # First file should be metadata
+         # First file is metadata (never compressed)
          metadata_content = files[0]['content']
-         documents = files[1:] if len(files) > 1 else []
+
+         # Remaining files are documents (always compressed)
+         documents = []
+         for file in files[1:]:
+             file['content'] = self._decompress_zstd(file['content'])
+             documents.append(file)

          # Build filename to type mapping from metadata
          filename_map = self._build_filename_to_type_map(metadata_content)
@@ -452,17 +414,14 @@ class TarDownloader:
                      file_content = tar.extractfile(member).read()

                      if idx == 0:
-                         # First file is always metadata (never compressed)
+                         # First file is metadata (never compressed)
                          metadata_content = file_content
                      else:
-                         member_name = os.path.basename(member.name)
-
-                         # Check if file is zstd compressed
-                         if self._is_zstd_compressed(file_content):
-                             file_content = self._decompress_zstd(file_content)
+                         # All other files are documents (always compressed)
+                         file_content = self._decompress_zstd(file_content)

                          documents.append({
-                             'name': member_name,
+                             'name': os.path.basename(member.name),
                              'content': file_content
                          })

@@ -488,15 +447,6 @@ class TarDownloader:
          except Exception as e:
              self._log_error(output_dir, filename, f"Tar extraction error: {str(e)}")
              return False
-
-     def _is_zstd_compressed(self, content):
-         """Check if content is zstd compressed by magic number"""
-         return len(content) >= 4 and content[:4] == b'\x28\xb5\x2f\xfd'
-
-     def _decompress_zstd(self, compressed_content):
-         """Decompress zstd content"""
-         dctx = zstd.ZstdDecompressor()
-         return dctx.decompress(compressed_content)

      async def download_and_process(self, session, url, semaphore, extraction_pool, tar_manager, output_dir, pbar, keep_document_types, range_lookup_db=None):
          async with semaphore:
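
Net effect of these hunks: _decompress_zstd moves earlier in the class, the magic-byte check _is_zstd_compressed is dropped, and every non-metadata tar member is now assumed to be zstd-compressed. A round-trip sketch with the python-zstandard package, assuming it is installed as zstandard:

    import zstandard as zstd

    data = b"hello sec filings" * 10
    compressed = zstd.ZstdCompressor().compress(data)

    # The frame magic number the removed _is_zstd_compressed sniffed for:
    assert compressed[:4] == b'\x28\xb5\x2f\xfd'

    # 2.4.1 skips the sniffing and decompresses unconditionally:
    assert zstd.ZstdDecompressor().decompress(compressed) == data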
@@ -654,10 +604,6 @@ class TarDownloader:
          elapsed_time = time.time() - start_time
          logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
          logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
-
-     def __del__(self):
-         if hasattr(self, 'loop') and self.loop.is_running():
-             self.loop.call_soon_threadsafe(self.loop.stop)

      def download_files_using_filename(self, filenames, output_dir="downloads", max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
          if self.api_key is None:
@@ -681,8 +627,7 @@ class TarDownloader:
              url = f"{self.BASE_URL}{filename}"
              urls.append(url)

-         seen = set()
-         urls = [url for url in urls if not (url in seen or seen.add(url))]
+         urls = list(set(urls))

          logger.debug(f"Downloading {len(urls)} tar files...")
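
One behavioral nuance: the removed comprehension deduplicated while preserving insertion order, whereas list(set(urls)) makes no ordering guarantee — presumably harmless here since the downloads are dispatched concurrently. For reference:

    urls = ["b", "a", "b", "c"]

    # 2.4.0: order-preserving dedup via a seen-set side effect
    seen = set()
    print([u for u in urls if not (u in seen or seen.add(u))])  # ['b', 'a', 'c']

    # 2.4.1: simpler, but set iteration order is arbitrary
    print(list(set(urls)))                                      # e.g. ['a', 'c', 'b']

    # Order-preserving alternative (Python 3.7+), should order ever matter:
    print(list(dict.fromkeys(urls)))                            # ['b', 'a', 'c']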
 

datamule/submission/submission.py CHANGED

@@ -110,7 +110,7 @@ class Submission:
              content_type = response.headers.get('Content-Type', '')
              if content_type == 'application/zstd':
                  dctx = zstd.ZstdDecompressor()
-                 sgml_content = dctx.decompress(sgml_content)
+                 sgml_content = dctx.decompressobj().decompress(sgml_content)
          else:
              raise ValueError(f"URL: {url}, Error: {response.getcode()}")
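
The switch from decompress() to decompressobj().decompress() matters when the server streams the zstd frame without embedding the decompressed size: python-zstandard's one-shot decompress() raises ZstdError if the frame header lacks a content size, while the streaming decompressobj() API does not need it. A minimal sketch of the difference, assuming the zstandard package:

    import zstandard as zstd

    # compressobj() streams, so the frame header omits the content size
    compressor = zstd.ZstdCompressor().compressobj()
    frame = compressor.compress(b"streamed payload") + compressor.flush()

    dctx = zstd.ZstdDecompressor()
    try:
        dctx.decompress(frame)   # one-shot: needs the embedded size
    except zstd.ZstdError:
        pass                     # "could not determine content size in frame header"
    assert dctx.decompressobj().decompress(frame) == b"streamed payload"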
 
@@ -122,7 +122,6 @@ class Submission:
              metadata = transform_metadata_string(metadata)

          self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-
          self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"

          self.documents_obj_list = []
@@ -341,6 +340,8 @@ class Submission:
              context = xbrl_record['_context']
              period_start_date = context.get('period_instant') or context.get('period_startdate')
              period_end_date = context.get('period_enddate')
+         else:
+             context = None

          # Create record in the format expected by construct_fundamentals
          record = {
@@ -348,7 +349,8 @@
              'name': name,
              'value': value,
              'period_start_date': period_start_date,
-             'period_end_date': period_end_date
+             'period_end_date': period_end_date,
+             'context' : context
          }

          xbrl.append(record)
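
Besides passing the context through to construct_fundamentals, the new else: context = None branch closes a loop hazard: without it, a fact lacking _context would silently reuse the context bound by a previous iteration (or raise NameError on the first). A hypothetical reduction of the bug, not datamule code:

    records = [{'_context': 'c1', 'value': 1}, {'value': 2}]
    for xbrl_record in records:
        if '_context' in xbrl_record:
            context = xbrl_record['_context']
        else:
            context = None   # the 2.4.1 fix; without this branch the
                             # second record would inherit 'c1'
        print(xbrl_record['value'], context)  # -> 1 c1, then 2 None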

{datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/METADATA RENAMED

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 2.4.0
+ Version: 2.4.1
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman

{datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/RECORD RENAMED

@@ -5,7 +5,7 @@ datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
  datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
  datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/book/book.py,sha256=Vw33JHhmulNDWRN2AQpUQrf8wgVqqUYg5QJgbKhBNak,773
+ datamule/book/book.py,sha256=AwQUKpd3iAUbUGs2SzODIiK7aBrG2YdqwjqMp8-Fvtg,839
  datamule/book/s3transfer.py,sha256=arftLhYThLSGvmBSNnU2rNpkqiyvwAL32OVAKP4HOAQ,12596
  datamule/cloud/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
@@ -14,7 +14,7 @@ datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5
  datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
  datamule/datamule/downloader.py,sha256=v0cG8eHZs9fttM55_ymHUWtPnCsK1aGiFTuM3jmLiCY,18650
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
- datamule/datamule/tar_downloader.py,sha256=5lHbk96MxtNVeuY1_uSAWj3tt5RqgOgvAr_7qQqbJmc,29483
+ datamule/datamule/tar_downloader.py,sha256=w_HePdFJ-SjiFNLpQrFW-zn0qYjABZNRZSCO118FIgM,27326
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -46,7 +46,7 @@ datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,180
  datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
  datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/submission/submission.py,sha256=JCGyfEVqaf8ct6h9h8WjK2zBnhg0lx9kKLud3nvJ2Eg,17516
+ datamule/submission/submission.py,sha256=cd1SKi3fzNmvXmgbtxA7j2zc2KnFE2f68Qbta9Bnlu8,17629
  datamule/submission/tar_submission.py,sha256=uJHyTY5G8OVqmXzb0zaBEsLNthppGqYXbW-xFM4XMok,2901
  datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
@@ -68,7 +68,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
  datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
- datamule-2.4.0.dist-info/METADATA,sha256=RSPqBwCagQnA41rQezMptrqFwnD0o65Fs74uGu12OlA,609
- datamule-2.4.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- datamule-2.4.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
- datamule-2.4.0.dist-info/RECORD,,
+ datamule-2.4.1.dist-info/METADATA,sha256=61-fgRGZb-L2yINFRhsGU_ITyPxrh7RmwC_VCVIITE4,609
+ datamule-2.4.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ datamule-2.4.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-2.4.1.dist-info/RECORD,,