datamule 1.7.1__py3-none-any.whl → 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/seclibrary/downloader.py CHANGED
@@ -9,20 +9,24 @@ import zstandard as zstd
  import io
  import json
  import tarfile
+ import logging
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
- from queue import Queue, Empty
+ from queue import Queue
  from threading import Thread, Lock
- from .query import query
  from os import cpu_count
  from secsgml import parse_sgml_content_into_memory
  from secsgml.utils import bytes_to_str
+ from .datamule_lookup import datamule_lookup

+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)


  class Downloader:
  def __init__(self, api_key=None):
- self.BASE_URL = "https://library.datamule.xyz/original/nc/"
+ self.BASE_URL = "https://sec-library.datamule.xyz/"
  self.CHUNK_SIZE = 2 * 1024 * 1024
  self.MAX_CONCURRENT_DOWNLOADS = 100
  self.MAX_DECOMPRESSION_WORKERS = cpu_count()
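The downloader now configures module-level logging (basicConfig at INFO) and routes status messages to a module logger, with most of the former print() calls downgraded to logger.debug. A minimal sketch, not part of the package, of how a caller could surface those debug messages; the logger name assumes it follows the module path datamule.seclibrary.downloader:

```python
import logging

# Raise the downloader's module logger to DEBUG so its progress messages are emitted.
logging.getLogger("datamule.seclibrary.downloader").setLevel(logging.DEBUG)
```

Because the module calls logging.basicConfig(level=logging.INFO) at import time, an application that wants its own root handler or format would need to configure logging before importing, or call basicConfig(..., force=True).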
@@ -66,7 +70,7 @@ class Downloader:
  with open(error_file, 'w') as f:
  json.dump(errors, f, indent=2)
  except Exception as e:
- print(f"Failed to log error to {error_file}: {str(e)}")
+ logger.error(f"Failed to log error to {error_file}: {str(e)}")

  class TarManager:
  def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
@@ -81,7 +85,7 @@ class Downloader:

  for i in range(num_tar_files):
  tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
- self.tar_files[i] = tarfile.open(tar_path, 'w')
+ self.tar_files[i] = tarfile.open(tar_path, 'a')
  self.tar_locks[i] = Lock()
  self.file_counters[i] = 0
  self.tar_sizes[i] = 0
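The batch tar archives are now opened in append mode ('a') instead of write mode ('w'), so reopening an existing batch_*.tar adds members rather than truncating it. A small stdlib illustration of that mode difference (the member name and payload below are made up):

```python
import io
import tarfile

payload = b"example payload"
info = tarfile.TarInfo(name="example/member.txt")  # hypothetical member name
info.size = len(payload)

# 'a' appends to an existing uncompressed tar (creating it if missing);
# 'w' would create a fresh archive and discard any existing contents.
with tarfile.open("batch_000_001.tar", "a") as tar:
    tar.addfile(info, io.BytesIO(payload))
```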
@@ -105,7 +109,7 @@ class Downloader:

  self.tar_sequences[tar_index] += 1
  new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
- self.tar_files[tar_index] = tarfile.open(new_tar_path, 'w')
+ self.tar_files[tar_index] = tarfile.open(new_tar_path, 'a')
  self.file_counters[tar_index] = 0
  self.tar_sizes[tar_index] = 0

@@ -127,7 +131,7 @@ class Downloader:
  return True

  except Exception as e:
- print(f"Error writing {filename} to tar {tar_index}: {str(e)}")
+ logger.error(f"Error writing {filename} to tar {tar_index}: {str(e)}")
  return False

  def _get_document_name(self, metadata, file_num, standardize_metadata):
@@ -153,7 +157,7 @@ class Downloader:
  try:
  tar.close()
  except Exception as e:
- print(f"Error closing tar {i}: {str(e)}")
+ logger.error(f"Error closing tar {i}: {str(e)}")

  def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
  dctx = zstd.ZstdDecompressor()
@@ -221,17 +225,21 @@ class Downloader:
  }

  async with session.get(url, headers=headers) as response:
+ content_type = response.headers.get('Content-Type', '')
+
  if response.status == 200:
  async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
  chunks.append(chunk)

  loop = asyncio.get_running_loop()
- if filename.endswith('.zst'):
+ if content_type == 'application/zstd':
+ logger.debug(f"Processing {filename} as compressed (zstd)")
  success = await loop.run_in_executor(
  decompression_pool,
  partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
  )
  else:
+ logger.debug(f"Processing {filename} as uncompressed")
  success = await loop.run_in_executor(
  decompression_pool,
  partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
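Compression is now detected from the response's Content-Type header ('application/zstd') rather than a .zst filename suffix, which matches the URL change that drops the .zst extension. A simplified sketch of the dispatch rule, not the package's exact code:

```python
def is_zstd_response(headers: dict) -> bool:
    # Treat a response as zstd-compressed only when the server labels it with this media type.
    return headers.get("Content-Type", "") == "application/zstd"

assert is_zstd_response({"Content-Type": "application/zstd"})
assert not is_zstd_response({"Content-Type": "application/octet-stream"})
```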
@@ -293,32 +301,27 @@ class Downloader:
  if self.api_key is None:
  raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")

- print("Querying SEC filings...")
- filings = query(
- submission_type=submission_type,
- cik=cik,
- filing_date=filing_date,
- api_key=self.api_key
- )
+ logger.debug("Querying SEC filings...")
+
+ filings = datamule_lookup(cik=cik, submission_type=submission_type, filing_date=filing_date,
+ columns=['accessionNumber'], distinct=True, page_size=25000, quiet=False)

  if accession_numbers:
  accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
- filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]
+ filings = [filing for filing in filings if filing['accessionNumber'] in accession_numbers]

  if skip_accession_numbers:
  skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
- filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]
+ filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]

- print(f"Generating URLs for {len(filings)} filings...")
+ logger.debug(f"Generating URLs for {len(filings)} filings...")
  urls = []
  for item in filings:
- url = f"{self.BASE_URL}{str(item['accession_number']).zfill(18)}.sgml"
- if item['compressed'] == True or item['compressed'] == 'true' or item['compressed'] == 'True':
- url += '.zst'
+ url = f"{self.BASE_URL}{str(item['accessionNumber']).zfill(18)}.sgml"
  urls.append(url)

  if not urls:
- print("No submissions found matching the criteria")
+ logger.warning("No submissions found matching the criteria")
  return

  urls = list(set(urls))
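With datamule_lookup the filing records use the camelCase key accessionNumber, and every URL now targets a plain .sgml object on sec-library.datamule.xyz with the accession number zero-padded to 18 digits; the old per-filing 'compressed' flag and .zst suffix are gone. A hypothetical example of the mapping (the accession number below is made up):

```python
BASE_URL = "https://sec-library.datamule.xyz/"

accession = 123456725000001  # "0001234567-25-000001" with dashes stripped
url = f"{BASE_URL}{str(accession).zfill(18)}.sgml"
# url == "https://sec-library.datamule.xyz/000123456725000001.sgml"
```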
@@ -328,8 +331,8 @@ class Downloader:
  asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))

  elapsed_time = time.time() - start_time
- print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
- print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+ logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+ logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")

  def __del__(self):
  if hasattr(self, 'loop') and self.loop.is_running():
@@ -348,10 +351,10 @@ class Downloader:
  for filename in filenames:
  if not isinstance(filename, str):
  raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
- if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
- raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
+ if not filename.endswith('.sgml'):
+ raise ValueError(f"Invalid filename format: {filename}. Expected .sgml extension.")

- print(f"Generating URLs for {len(filenames)} files...")
+ logger.debug(f"Generating URLs for {len(filenames)} files...")
  urls = []
  for filename in filenames:
  url = f"{self.BASE_URL}{filename}"
@@ -360,7 +363,7 @@ class Downloader:
  seen = set()
  urls = [url for url in urls if not (url in seen or seen.add(url))]

- print(f"Downloading {len(urls)} files...")
+ logger.debug(f"Downloading {len(urls)} files...")

  start_time = time.time()

@@ -374,12 +377,13 @@ class Downloader:
  ))

  elapsed_time = time.time() - start_time
- print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
- print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+ logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+ logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")


  def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
  skip_accession_numbers=[], max_batch_size=1024*1024*1024):
+
  if accession_numbers:
  accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
  elif accession_numbers == []:
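The module-level download() still normalizes caller-supplied accession numbers by stripping dashes and casting to int, matching the integer accessionNumber values used for filtering above. A hypothetical example of that normalization:

```python
accession_numbers = ["0001234567-25-000001"]  # made-up accession number
normalized = [int(str(x).replace("-", "")) for x in accession_numbers]
# normalized == [123456725000001]
```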
datamule/submission.py CHANGED
@@ -3,17 +3,21 @@ import json
  from .document.document import Document
  from secsgml import parse_sgml_content_into_memory
  from secsgml.parse_sgml import transform_metadata_string
+ from secsgml.utils import bytes_to_str
+ from .sec.utils import headers
  import tarfile
  import zstandard as zstd
  import gzip
+ import urllib.request
+


  class Submission:
  def __init__(self, path=None, sgml_content=None, keep_document_types=None,
- batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
+ batch_tar_path=None, accession_prefix=None, portfolio_ref=None,url=None):

  # Validate parameters
- param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
+ param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path,url])
  if param_count != 1:
  raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")

@@ -25,9 +29,19 @@ class Submission:
  self.accession_prefix = accession_prefix
  self.portfolio_ref = portfolio_ref

- if sgml_content is not None:
+ if url is not None or sgml_content is not None:
+ if url is not None:
+ request = urllib.request.Request(url, headers=headers)
+ response = urllib.request.urlopen(request)
+
+ if response.getcode() == 200:
+ sgml_content=response.read()
+ else:
+ raise ValueError(f"URL: {url}, Error: {response.getcode()}")
+
  self.path = None
  metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+ metadata = bytes_to_str(metadata)

  # standardize metadata
  metadata = transform_metadata_string(metadata)
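Submission now accepts a url argument as an alternative to path, sgml_content, or batch_tar_path: the SGML is fetched with urllib.request using the package's SEC request headers and then parsed in memory just like sgml_content. A hedged usage sketch (the URL is made up, and the import path assumes datamule/submission.py is importable as shown):

```python
from datamule.submission import Submission

# Hypothetical URL; any reachable .sgml submission file should behave the same way.
sub = Submission(url="https://sec-library.datamule.xyz/000123456725000001.sgml")
```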
datamule-1.7.1.dist-info/METADATA → datamule-1.8.1.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.7.1
+ Version: 1.8.1
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
datamule-1.7.1.dist-info/RECORD → datamule-1.8.1.dist-info/RECORD
@@ -6,7 +6,7 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
  datamule/portfolio.py,sha256=tADqQMkFaFyjanbJ0QcaOHGdJJB254rOg29FW7a13l0,11835
  datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
  datamule/sheet.py,sha256=V5iR9_LkuwTFxfHCfzgadO6qgB6qOhzWiCAED-y8ZJQ,22744
- datamule/submission.py,sha256=ooLsesZ5HkgSWyEFID4u08CobTxdo35eAUHSCB6fw2k,10332
+ datamule/submission.py,sha256=IHfEvHcLj9mrJGCNaJSMRqP9kHuJerGGM9IrN5mLDtM,10865
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/datamule/sec_connector.py,sha256=qCDsOgSFtfp-uz-APJjX4YrRoIGnnX-xHCL_JjLmRxk,2387
@@ -60,12 +60,12 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
  datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
  datamule/seclibrary/datamule_lookup.py,sha256=_opEh-DRY3ZBXFbuE2Ua_aRwoc1IsV-cPSWK0c61ofY,9465
- datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOaHWXs,17564
+ datamule/seclibrary/downloader.py,sha256=6cPPddjXekOwlzsyratUqzpCSbvdaNyRCGjQXUtVoJU,17930
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
- datamule-1.7.1.dist-info/METADATA,sha256=DQV07IcUFX9kzNglmVZ1MvBip_cv5kiuQTKkGvWFsaQ,524
- datamule-1.7.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- datamule-1.7.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
- datamule-1.7.1.dist-info/RECORD,,
+ datamule-1.8.1.dist-info/METADATA,sha256=EANuFHyM9j25cEgk3wWP9eY1Pgb8hT2xV_g0010zpAA,524
+ datamule-1.8.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ datamule-1.8.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-1.8.1.dist-info/RECORD,,