datamule 1.7.1__py3-none-any.whl → 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/seclibrary/downloader.py +35 -31
- datamule/submission.py +17 -3
- {datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/METADATA +1 -1
- {datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/RECORD +6 -6
- {datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/WHEEL +0 -0
- {datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/top_level.txt +0 -0
datamule/seclibrary/downloader.py CHANGED

@@ -9,20 +9,24 @@ import zstandard as zstd
 import io
 import json
 import tarfile
+import logging
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from queue import Queue
+from queue import Queue
 from threading import Thread, Lock
-from .query import query
 from os import cpu_count
 from secsgml import parse_sgml_content_into_memory
 from secsgml.utils import bytes_to_str
+from .datamule_lookup import datamule_lookup
 
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
 
 class Downloader:
     def __init__(self, api_key=None):
-        self.BASE_URL = "https://library.datamule.xyz/
+        self.BASE_URL = "https://sec-library.datamule.xyz/"
         self.CHUNK_SIZE = 2 * 1024 * 1024
         self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
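
The module-level setup above configures the root logger at INFO, so the many logger.debug(...) calls introduced in this version are silent by default; only warnings and errors surface. A minimal, self-contained sketch of the same pattern (the messages are illustrative, not from the package):

    import logging

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    logger.debug("suppressed: below the INFO threshold")
    logger.info("emitted")
    logger.error("emitted")

A consumer who wants the new debug output can opt in with logging.getLogger('datamule.seclibrary.downloader').setLevel(logging.DEBUG), assuming the module path matches the wheel layout shown in the RECORD below.
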
@@ -66,7 +70,7 @@ class Downloader:
             with open(error_file, 'w') as f:
                 json.dump(errors, f, indent=2)
         except Exception as e:
-
+            logger.error(f"Failed to log error to {error_file}: {str(e)}")
 
     class TarManager:
         def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
@@ -81,7 +85,7 @@ class Downloader:
 
             for i in range(num_tar_files):
                 tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
-                self.tar_files[i] = tarfile.open(tar_path, '
+                self.tar_files[i] = tarfile.open(tar_path, 'a')
                 self.tar_locks[i] = Lock()
                 self.file_counters[i] = 0
                 self.tar_sizes[i] = 0
@@ -105,7 +109,7 @@
 
             self.tar_sequences[tar_index] += 1
             new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
-            self.tar_files[tar_index] = tarfile.open(new_tar_path, '
+            self.tar_files[tar_index] = tarfile.open(new_tar_path, 'a')
             self.file_counters[tar_index] = 0
             self.tar_sizes[tar_index] = 0
 
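
Both tar-opening call sites now use mode 'a' (the mode they replace is truncated in this diff). With 'a', tarfile appends members to an existing archive and creates the file if it is missing, so reopening a batch archive does not discard earlier members. A small sketch of that behavior, using hypothetical file names:

    import io
    import tarfile

    path = 'batch_000_001.tar'  # hypothetical archive, mirroring the naming above
    for name, payload in [('first.txt', b'hello'), ('second.txt', b'world')]:
        with tarfile.open(path, 'a') as tar:  # append; creates the file on first use
            info = tarfile.TarInfo(name=name)
            info.size = len(payload)
            tar.addfile(info, io.BytesIO(payload))

    with tarfile.open(path, 'r') as tar:
        print(tar.getnames())  # ['first.txt', 'second.txt']
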
@@ -127,7 +131,7 @@
                 return True
 
             except Exception as e:
-
+                logger.error(f"Error writing {filename} to tar {tar_index}: {str(e)}")
                 return False
 
         def _get_document_name(self, metadata, file_num, standardize_metadata):
@@ -153,7 +157,7 @@
             try:
                 tar.close()
             except Exception as e:
-
+                logger.error(f"Error closing tar {i}: {str(e)}")
 
     def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
         dctx = zstd.ZstdDecompressor()
@@ -221,17 +225,21 @@
                     }
 
                     async with session.get(url, headers=headers) as response:
+                        content_type = response.headers.get('Content-Type', '')
+
                         if response.status == 200:
                             async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
                                 chunks.append(chunk)
 
                             loop = asyncio.get_running_loop()
-                            if
+                            if content_type == 'application/zstd':
+                                logger.debug(f"Processing {filename} as compressed (zstd)")
                                 success = await loop.run_in_executor(
                                     decompression_pool,
                                     partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
                                 )
                             else:
+                                logger.debug(f"Processing {filename} as uncompressed")
                                 success = await loop.run_in_executor(
                                     decompression_pool,
                                     partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
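
The compressed/uncompressed decision now comes from the response's Content-Type header ('application/zstd') rather than per-filing metadata, so the server controls the encoding. A minimal sketch of the decompression side, assuming the joined chunks form one complete zstd frame (decompress_chunks is an illustrative name, not a package function):

    import io
    import zstandard as zstd

    def decompress_chunks(chunks):
        # Reassemble the HTTP chunks, then stream-decompress so the
        # decompressed size does not need to be known up front.
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(io.BytesIO(b''.join(chunks))) as reader:
            return reader.read()

    frame = zstd.ZstdCompressor().compress(b'<SEC-DOCUMENT>...</SEC-DOCUMENT>')
    assert decompress_chunks([frame[:10], frame[10:]]) == b'<SEC-DOCUMENT>...</SEC-DOCUMENT>'
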
@@ -293,32 +301,27 @@
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
 
-
-
-
-
-            filing_date=filing_date,
-            api_key=self.api_key
-        )
+        logger.debug("Querying SEC filings...")
+
+        filings = datamule_lookup(cik=cik, submission_type=submission_type, filing_date=filing_date,
+                                  columns=['accessionNumber'], distinct=True, page_size=25000, quiet=False)
 
         if accession_numbers:
             accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
-            filings = [filing for filing in filings if filing['
+            filings = [filing for filing in filings if filing['accessionNumber'] in accession_numbers]
 
         if skip_accession_numbers:
             skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
-            filings = [filing for filing in filings if filing['
+            filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]
 
-
+        logger.debug(f"Generating URLs for {len(filings)} filings...")
         urls = []
         for item in filings:
-            url = f"{self.BASE_URL}{str(item['
-            if item['compressed'] == True or item['compressed'] == 'true' or item['compressed'] == 'True':
-                url += '.zst'
+            url = f"{self.BASE_URL}{str(item['accessionNumber']).zfill(18)}.sgml"
             urls.append(url)
 
         if not urls:
-
+            logger.warning("No submissions found matching the criteria")
             return
 
         urls = list(set(urls))
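
With the old compressed flag (and its '.zst' suffix) gone, every filing maps to one canonical URL: strip the dashes from the accession number, normalize through int to drop leading zeros, zero-pad back to 18 digits, and append '.sgml'. A worked example of that construction (accession_to_url is a hypothetical helper):

    def accession_to_url(accession, base_url='https://sec-library.datamule.xyz/'):
        # De-dash, normalize via int, left-pad to 18 digits, add the .sgml suffix
        normalized = str(int(str(accession).replace('-', '')))
        return f'{base_url}{normalized.zfill(18)}.sgml'

    print(accession_to_url('0001234567-25-000123'))
    # https://sec-library.datamule.xyz/000123456725000123.sgml
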
@@ -328,8 +331,8 @@
         asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))
 
         elapsed_time = time.time() - start_time
-
-
+        logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+        logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
     def __del__(self):
         if hasattr(self, 'loop') and self.loop.is_running():
@@ -348,10 +351,10 @@
         for filename in filenames:
             if not isinstance(filename, str):
                 raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
-            if not
-                raise ValueError(f"Invalid filename format: {filename}. Expected .sgml
+            if not filename.endswith('.sgml'):
+                raise ValueError(f"Invalid filename format: {filename}. Expected .sgml extension.")
 
-
+        logger.debug(f"Generating URLs for {len(filenames)} files...")
         urls = []
         for filename in filenames:
             url = f"{self.BASE_URL}{filename}"
@@ -360,7 +363,7 @@
         seen = set()
         urls = [url for url in urls if not (url in seen or seen.add(url))]
 
-
+        logger.debug(f"Downloading {len(urls)} files...")
 
         start_time = time.time()
 
@@ -374,12 +377,13 @@
         ))
 
         elapsed_time = time.time() - start_time
-
-
+        logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+        logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
 
 def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
              skip_accession_numbers=[], max_batch_size=1024*1024*1024):
+
     if accession_numbers:
         accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
     elif accession_numbers == []:

datamule/submission.py CHANGED

@@ -3,17 +3,21 @@ import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
 from secsgml.parse_sgml import transform_metadata_string
+from secsgml.utils import bytes_to_str
+from .sec.utils import headers
 import tarfile
 import zstandard as zstd
 import gzip
+import urllib.request
+
 
 
 class Submission:
     def __init__(self, path=None, sgml_content=None, keep_document_types=None,
-                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
+                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None,url=None):
 
         # Validate parameters
-        param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
+        param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path,url])
         if param_count != 1:
             raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
 
@@ -25,9 +29,19 @@
         self.accession_prefix = accession_prefix
         self.portfolio_ref = portfolio_ref
 
-        if sgml_content is not None:
+        if url is not None or sgml_content is not None:
+            if url is not None:
+                request = urllib.request.Request(url, headers=headers)
+                response = urllib.request.urlopen(request)
+
+                if response.getcode() == 200:
+                    sgml_content=response.read()
+                else:
+                    raise ValueError(f"URL: {url}, Error: {response.getcode()}")
+
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+            metadata = bytes_to_str(metadata)
 
             # standardize metadata
             metadata = transform_metadata_string(metadata)
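
Taken together, these changes let a Submission be constructed directly from a library URL, with the SGML fetched internally via urllib and the response bytes handed to the existing sgml_content path. A hedged usage sketch (it assumes Submission is importable from the package root, as in earlier releases, and uses a placeholder URL in the new 18-digit .sgml scheme):

    from datamule import Submission

    # Placeholder URL; a real one follows the accession-number scheme above
    sub = Submission(url='https://sec-library.datamule.xyz/000123456725000123.sgml')

    # The instance then behaves like one built from sgml_content: path is None
    assert sub.path is None

Note that the validation error message above still names only path, sgml_content, and batch_tar_path, even though url now counts toward the same exactly-one check.
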

{datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/RECORD CHANGED

@@ -6,7 +6,7 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
 datamule/portfolio.py,sha256=tADqQMkFaFyjanbJ0QcaOHGdJJB254rOg29FW7a13l0,11835
 datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
 datamule/sheet.py,sha256=V5iR9_LkuwTFxfHCfzgadO6qgB6qOhzWiCAED-y8ZJQ,22744
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=IHfEvHcLj9mrJGCNaJSMRqP9kHuJerGGM9IrN5mLDtM,10865
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/sec_connector.py,sha256=qCDsOgSFtfp-uz-APJjX4YrRoIGnnX-xHCL_JjLmRxk,2387
@@ -60,12 +60,12 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
 datamule/seclibrary/datamule_lookup.py,sha256=_opEh-DRY3ZBXFbuE2Ua_aRwoc1IsV-cPSWK0c61ofY,9465
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=6cPPddjXekOwlzsyratUqzpCSbvdaNyRCGjQXUtVoJU,17930
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-1.
-datamule-1.
-datamule-1.
-datamule-1.
+datamule-1.8.1.dist-info/METADATA,sha256=EANuFHyM9j25cEgk3wWP9eY1Pgb8hT2xV_g0010zpAA,524
+datamule-1.8.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.8.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.8.1.dist-info/RECORD,,

{datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/WHEEL: file without changes
{datamule-1.7.1.dist-info → datamule-1.8.1.dist-info}/top_level.txt: file without changes