datamule 2.4.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/book/book.py +6 -1
- datamule/datamule/tar_downloader.py +23 -78
- datamule/submission/submission.py +5 -3
- {datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/METADATA +1 -1
- {datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/RECORD +7 -7
- {datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/WHEEL +0 -0
- {datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/top_level.txt +0 -0
datamule/book/book.py
CHANGED

@@ -10,4 +10,9 @@ class Book:
         s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials, max_workers=max_workers,
                     errors_json_filename=errors_json_filename, retry_errors=retry_errors,
                     force_daily=force_daily, cik=cik, submission_type=submission_type,
-                    filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
+                    filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
+
+
+def download_filings_processed_r2():
+    pass
+
datamule/datamule/tar_downloader.py
CHANGED

@@ -11,8 +11,7 @@ import tarfile
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from queue import Queue
-from threading import Thread, Lock
+from threading import Lock
 from os import cpu_count
 from .datamule_lookup import datamule_lookup
 from ..utils.format_accession import format_accession
@@ -37,19 +36,7 @@ class TarDownloader:
         self.RANGE_MERGE_THRESHOLD = 1024 # Merge ranges if gap <= 1024 bytes
         if api_key is not None:
             self._api_key = api_key
-        self.loop = asyncio.new_event_loop()
-        self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
-        self.loop_thread.start()
-        self.async_queue = Queue()
         self.error_log_lock = Lock()
-
-    def _run_event_loop(self):
-        asyncio.set_event_loop(self.loop)
-        self.loop.run_forever()
-
-    def _run_coroutine(self, coro):
-        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
-        return future.result()

     @property
     def api_key(self):
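For reference, the deleted code implemented the common "background event loop" pattern: a dedicated asyncio loop running in a daemon thread so that synchronous callers can submit coroutines to it. A standalone sketch of that pattern (illustrative only, not part of datamule's API):

import asyncio
from threading import Thread

class LoopRunner:
    def __init__(self):
        self.loop = asyncio.new_event_loop()
        # Daemon thread keeps the loop alive without blocking interpreter exit
        self.thread = Thread(target=self._run, daemon=True)
        self.thread.start()

    def _run(self):
        asyncio.set_event_loop(self.loop)
        self.loop.run_forever()

    def run_coroutine(self, coro):
        # Submit from synchronous code; block until the coroutine finishes
        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
        return future.result()

runner = LoopRunner()
print(runner.run_coroutine(asyncio.sleep(0.1, result="done")))  # prints "done"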
@@ -286,6 +273,11 @@ class TarDownloader:
             filtered.append(doc)

         return filtered
+
+    def _decompress_zstd(self, compressed_content):
+        """Decompress zstd content"""
+        dctx = zstd.ZstdDecompressor()
+        return dctx.decompress(compressed_content)

 class TarManager:
     def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
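The new `_decompress_zstd` helper uses python-zstandard's one-shot API, which relies on the frame header carrying the decompressed size. A minimal round-trip sketch of the case it handles (standalone, using the zstandard package):

import zstandard as zstd

payload = b"hello world" * 100
# One-shot compress() embeds the content size in the frame header by default,
# which is exactly what one-shot decompress() requires.
compressed = zstd.ZstdCompressor().compress(payload)

dctx = zstd.ZstdDecompressor()
assert dctx.decompress(compressed) == payload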
@@ -357,6 +349,8 @@ class TarDownloader:
     def _parse_multipart_byteranges(self, content, content_type):
         """
         Parse multipart/byteranges response.
+        Currently simplified for single-range responses.
+        Future: implement full multipart parsing when using database with multiple ranges.

         Args:
             content: Response body bytes
@@ -365,49 +359,12 @@ class TarDownloader:
         Returns:
             list of (start_byte, end_byte, data) tuples
         """
-        #
+        # For now, handle single range responses only
         if 'boundary=' not in content_type:
-            # Single range response, not multipart
             return [(None, None, content)]

-        boundary = content_type.split('boundary=')[1].strip()
-        boundary_bytes = f'--{boundary}'.encode('utf-8')
-        end_boundary_bytes = f'--{boundary}--'.encode('utf-8')
-
-        parts = []
-        sections = content.split(boundary_bytes)
-
-        for section in sections[1:]: # Skip first empty section
-            if section.startswith(end_boundary_bytes) or not section.strip():
-                continue
-
-            # Split headers from body
-            header_end = section.find(b'\r\n\r\n')
-            if header_end == -1:
-                header_end = section.find(b'\n\n')
-                if header_end == -1:
-                    continue
-                body_start = header_end + 2
-            else:
-                body_start = header_end + 4
-
-            headers = section[:header_end].decode('utf-8', errors='ignore')
-            body = section[body_start:].rstrip(b'\r\n')
-
-            # Parse Content-Range header
-            start_byte = None
-            end_byte = None
-            for line in headers.split('\n'):
-                if line.lower().startswith('content-range:'):
-                    # Format: "Content-Range: bytes START-END/TOTAL"
-                    range_part = line.split(':')[1].strip()
-                    if 'bytes ' in range_part:
-                        byte_range = range_part.split('bytes ')[1].split('/')[0]
-                        start_byte, end_byte = map(int, byte_range.split('-'))
-
-            parts.append((start_byte, end_byte, body))
-
-        return parts
+        # TODO: Implement full multipart parsing when database returns multiple discontinuous ranges
+        return [(None, None, content)]

     def extract_and_process_tar(self, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial=False):
         """Extract tar file and process its contents"""
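Single-range responses carry their offsets in a Content-Range header rather than a multipart body, which is why the simplified method can return (None, None, content) and let callers use the whole payload. For reference, extracting offsets from such a header follows plain HTTP semantics:

# Sketch: parsing a standard Content-Range value such as "bytes 0-1023/146515"
def parse_content_range(header_value):
    byte_range = header_value.split("bytes ")[1].split("/")[0]
    start, end = map(int, byte_range.split("-"))
    return start, end

assert parse_content_range("bytes 0-1023/146515") == (0, 1023)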
@@ -422,9 +379,14 @@ class TarDownloader:
             self._log_error(output_dir, filename, "No files found in partial tar")
             return False

-        # First file
+        # First file is metadata (never compressed)
         metadata_content = files[0]['content']
-
+
+        # Remaining files are documents (always compressed)
+        documents = []
+        for file in files[1:]:
+            file['content'] = self._decompress_zstd(file['content'])
+            documents.append(file)

         # Build filename to type mapping from metadata
         filename_map = self._build_filename_to_type_map(metadata_content)
@@ -452,17 +414,14 @@ class TarDownloader:
             file_content = tar.extractfile(member).read()

             if idx == 0:
-                # First file is metadata
+                # First file is metadata (never compressed)
                 metadata_content = file_content
             else:
-
-
-                # Check if file is zstd compressed
-                if self._is_zstd_compressed(file_content):
-                    file_content = self._decompress_zstd(file_content)
+                # All other files are documents (always compressed)
+                file_content = self._decompress_zstd(file_content)

                 documents.append({
-                    'name':
+                    'name': os.path.basename(member.name),
                     'content': file_content
                 })

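Note that `os.path.basename(member.name)` keeps only the leaf name of each tar member, so any directory prefix inside the archive is dropped (and same-named files in different directories would collide). A one-line illustration with hypothetical member paths:

import os

# Hypothetical tar member paths; only the leaf names survive
members = ["0001234567-25-000001/primary.htm", "0001234567-25-000001/ex99.htm"]
print([os.path.basename(m) for m in members])  # ['primary.htm', 'ex99.htm']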
@@ -488,15 +447,6 @@ class TarDownloader:
         except Exception as e:
             self._log_error(output_dir, filename, f"Tar extraction error: {str(e)}")
             return False
-
-    def _is_zstd_compressed(self, content):
-        """Check if content is zstd compressed by magic number"""
-        return len(content) >= 4 and content[:4] == b'\x28\xb5\x2f\xfd'
-
-    def _decompress_zstd(self, compressed_content):
-        """Decompress zstd content"""
-        dctx = zstd.ZstdDecompressor()
-        return dctx.decompress(compressed_content)

     async def download_and_process(self, session, url, semaphore, extraction_pool, tar_manager, output_dir, pbar, keep_document_types, range_lookup_db=None):
         async with semaphore:
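The removed `_is_zstd_compressed` helper sniffed the standard zstd frame magic number (0xFD2FB528, stored little-endian on the wire as 28 B5 2F FD); it is no longer needed now that documents are always compressed. The check it performed, as a standalone sketch:

import zstandard as zstd

frame = zstd.ZstdCompressor().compress(b"data")
# Every zstd frame begins with the magic number 0xFD2FB528 (little-endian)
assert frame[:4] == b"\x28\xb5\x2f\xfd"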
@@ -654,10 +604,6 @@ class TarDownloader:
         elapsed_time = time.time() - start_time
         logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
         logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
-
-    def __del__(self):
-        if hasattr(self, 'loop') and self.loop.is_running():
-            self.loop.call_soon_threadsafe(self.loop.stop)

     def download_files_using_filename(self, filenames, output_dir="downloads", max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
         if self.api_key is None:
@@ -681,8 +627,7 @@ class TarDownloader:
             url = f"{self.BASE_URL}{filename}"
             urls.append(url)

-
-        urls = [url for url in urls if not (url in seen or seen.add(url))]
+        urls = list(set(urls))

         logger.debug(f"Downloading {len(urls)} tar files...")

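One behavioral nuance of the new dedup: `list(set(urls))` does not preserve insertion order, while the removed seen-set comprehension did. If order ever matters, an order-preserving equivalent is a one-liner:

# Order-preserving dedup (equivalent to the removed seen-set idiom)
urls = ["a", "b", "a", "c"]
deduped = list(dict.fromkeys(urls))  # keeps first occurrence of each item
assert deduped == ["a", "b", "c"]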
datamule/submission/submission.py
CHANGED

@@ -110,7 +110,7 @@ class Submission:
         content_type = response.headers.get('Content-Type', '')
         if content_type == 'application/zstd':
             dctx = zstd.ZstdDecompressor()
-            sgml_content = dctx.decompress(sgml_content)
+            sgml_content = dctx.decompressobj().decompress(sgml_content)
         else:
             raise ValueError(f"URL: {url}, Error: {response.getcode()}")

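The one-line change above swaps the one-shot `decompress()` for `decompressobj().decompress()`. In python-zstandard, the one-shot call fails on frames whose header does not record the decompressed size (typical of streamed compression), while the streaming object needs no size up front. A sketch of the failure mode the change avoids, assuming the server emits such frames:

import zstandard as zstd

payload = b"example sgml content"
# compressobj() streams, so the frame header carries no content size
cobj = zstd.ZstdCompressor().compressobj()
frame = cobj.compress(payload) + cobj.flush()

dctx = zstd.ZstdDecompressor()
assert dctx.decompressobj().decompress(frame) == payload
# dctx.decompress(frame) would raise ZstdError: could not determine content size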
@@ -122,7 +122,6 @@ class Submission:
         metadata = transform_metadata_string(metadata)

         self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-
         self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"

         self.documents_obj_list = []
@@ -341,6 +340,8 @@ class Submission:
             context = xbrl_record['_context']
             period_start_date = context.get('period_instant') or context.get('period_startdate')
             period_end_date = context.get('period_enddate')
+        else:
+            context = None

         # Create record in the format expected by construct_fundamentals
         record = {
@@ -348,7 +349,8 @@ class Submission:
             'name': name,
             'value': value,
             'period_start_date': period_start_date,
-            'period_end_date': period_end_date
+            'period_end_date': period_end_date,
+            'context' : context
         }

         xbrl.append(record)
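The first change above binds `context` to None when a record carries no `_context`, so the record dict can always include it; the second threads `context` into the record. A minimal sketch of the guard (the surrounding condition is not shown in the diff, so the triggering record here is hypothetical):

xbrl_record = {"name": "Revenues", "value": "100"}  # hypothetical: no '_context' key

if "_context" in xbrl_record:
    context = xbrl_record["_context"]
else:
    context = None  # mirrors the added else-branch

record = {"name": xbrl_record["name"], "value": xbrl_record["value"], "context": context}
assert record["context"] is None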
{datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/RECORD
CHANGED

@@ -5,7 +5,7 @@ datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
 datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/book/book.py,sha256=
+datamule/book/book.py,sha256=AwQUKpd3iAUbUGs2SzODIiK7aBrG2YdqwjqMp8-Fvtg,839
 datamule/book/s3transfer.py,sha256=arftLhYThLSGvmBSNnU2rNpkqiyvwAL32OVAKP4HOAQ,12596
 datamule/cloud/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
@@ -14,7 +14,7 @@ datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5
 datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
 datamule/datamule/downloader.py,sha256=v0cG8eHZs9fttM55_ymHUWtPnCsK1aGiFTuM3jmLiCY,18650
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
-datamule/datamule/tar_downloader.py,sha256=
+datamule/datamule/tar_downloader.py,sha256=w_HePdFJ-SjiFNLpQrFW-zn0qYjABZNRZSCO118FIgM,27326
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -46,7 +46,7 @@ datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,180
 datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
 datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/submission/submission.py,sha256=
+datamule/submission/submission.py,sha256=cd1SKi3fzNmvXmgbtxA7j2zc2KnFE2f68Qbta9Bnlu8,17629
 datamule/submission/tar_submission.py,sha256=uJHyTY5G8OVqmXzb0zaBEsLNthppGqYXbW-xFM4XMok,2901
 datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
@@ -68,7 +68,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
 datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
-datamule-2.4.0.dist-info/METADATA,sha256=
-datamule-2.4.0.dist-info/WHEEL,sha256=
-datamule-2.4.0.dist-info/top_level.txt,sha256=
-datamule-2.4.0.dist-info/RECORD,,
+datamule-2.4.1.dist-info/METADATA,sha256=61-fgRGZb-L2yINFRhsGU_ITyPxrh7RmwC_VCVIITE4,609
+datamule-2.4.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.4.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.4.1.dist-info/RECORD,,
{datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/WHEEL
File without changes

{datamule-2.4.0.dist-info → datamule-2.4.1.dist-info}/top_level.txt
File without changes