datamule 1.5.8__py3-none-any.whl → 1.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/portfolio.py CHANGED
@@ -127,7 +127,7 @@ class Portfolio:
         self.accession_numbers = new_accession_numbers
 
     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
-                             requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True, **kwargs):
+                             requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True,skip_existing=True, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -135,6 +135,11 @@ class Portfolio:
         # Process CIK and metadata filters
         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
 
+        accession_numbers = self.accession_numbers if hasattr(self, 'accession_numbers') else None
+        skip_accession_numbers = []
+        if skip_existing:
+            skip_accession_numbers = [sub.accession for sub in self]
+
         if provider == 'datamule':
 
             seclibrary_download(
@@ -143,10 +148,11 @@ class Portfolio:
                 api_key=self.api_key,
                 submission_type=submission_type,
                 filing_date=filing_date,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                accession_numbers=accession_numbers,
                 keep_document_types=document_type,
                 keep_filtered_metadata=keep_filtered_metadata,
                 standardize_metadata=standardize_metadata,
+                skip_accession_numbers=skip_accession_numbers
             )
         else:
             sec_download(
@@ -155,10 +161,11 @@ class Portfolio:
                 submission_type=submission_type,
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                accession_numbers=accession_numbers,
                 keep_document_types=document_type,
                 keep_filtered_metadata=keep_filtered_metadata,
                 standardize_metadata=standardize_metadata,
+                skip_accession_numbers=skip_accession_numbers
             )
 
         self.submissions_loaded = False
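The practical effect of the new skip_existing flag: re-running download_submissions on an existing portfolio no longer re-downloads submissions already on disk, because their accession numbers are collected into a deny-list that is passed to both providers. A minimal usage sketch (ticker and directory are illustrative; assumes a Portfolio directory that already holds some filings):

    from datamule import Portfolio

    portfolio = Portfolio('filings')   # existing portfolio directory (illustrative path)
    # Re-running the same download now skips accession numbers already present
    portfolio.download_submissions(ticker='AAPL', submission_type='10-K',
                                   skip_existing=True)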
datamule/sec/submissions/downloader.py CHANGED
@@ -5,7 +5,8 @@ from tqdm import tqdm
 
 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
              requests_per_second=5, output_dir="filings", accession_numbers=None,
-             quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
+             quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
+             skip_accession_numbers=[]):
     # Make sure output directory exists
     os.makedirs(output_dir, exist_ok=True)
 
@@ -29,5 +30,6 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
         requests_per_second=requests_per_second,
         document_callback=callback_wrapper,
         accession_numbers=accession_numbers,
+        skip_accession_numbers=skip_accession_numbers,
         quiet=quiet
     )
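This wrapper simply forwards the new deny-list to the Streamer. A direct-call sketch (CIK and accession number are hypothetical; the dashed accession format matches what streamer.py compares against):

    from datamule.sec.submissions.downloader import download

    already_downloaded = ['0000320193-23-000106']   # hypothetical accession number
    download(cik='320193', submission_type='10-K', output_dir='filings',
             skip_accession_numbers=already_downloaded)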
datamule/sec/submissions/streamer.py CHANGED
@@ -21,7 +21,7 @@ def fix_filing_url(url):
     return url
 
 class Streamer(EFTSQuery):
-    def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None, quiet=False):
+    def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None,skip_accession_numbers=None, quiet=False):
         super().__init__(requests_per_second=requests_per_second, quiet=quiet)
         self.document_callback = document_callback
         self.document_queue = asyncio.Queue()
@@ -32,6 +32,7 @@ class Streamer(EFTSQuery):
         self.documents_processed = 0
         self.total_documents = 0
         self.accession_numbers = accession_numbers
+        self.skip_accession_numbers = skip_accession_numbers
         self.skipped_documents = 0
 
     async def _fetch_worker(self):
@@ -81,6 +82,9 @@ class Streamer(EFTSQuery):
         if self.accession_numbers is not None and accno_w_dash not in self.accession_numbers:
             return None, None, None
 
+        if self.skip_accession_numbers is not None and accno_w_dash in self.skip_accession_numbers:
+            return None, None, None
+
         # Construct the URL
         url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accno_no_dash}/{accno_w_dash}.txt"
         url = fix_filing_url(url)
@@ -218,7 +222,7 @@ class Streamer(EFTSQuery):
     return results
 
 def stream(cik=None, submission_type=None, filing_date=None, location=None,
-           requests_per_second=5.0, document_callback=None, accession_numbers=None,
+           requests_per_second=5.0, document_callback=None, accession_numbers=None,skip_accession_numbers=[],
           quiet=False, name=None):
     """
     Stream EFTS results and download documents into memory.
@@ -257,6 +261,7 @@ def stream(cik=None, submission_type=None, filing_date=None, location=None,
         requests_per_second=requests_per_second,
         document_callback=document_callback,
         accession_numbers=accession_numbers,
+        skip_accession_numbers=skip_accession_numbers,
         quiet=quiet
     )
     return await streamer.stream(cik, submission_type, filing_date, location, name)
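Per document, the Streamer now applies two filters before building the EDGAR URL: an optional allow-list (accession_numbers) and the new deny-list (skip_accession_numbers), both matched on the dashed accession number. A standalone sketch of the combined check (should_fetch is a hypothetical name used only for illustration):

    def should_fetch(accno_w_dash, accession_numbers=None, skip_accession_numbers=None):
        # Allow-list: when given, only listed accession numbers pass
        if accession_numbers is not None and accno_w_dash not in accession_numbers:
            return False
        # Deny-list: anything already downloaded is skipped
        if skip_accession_numbers is not None and accno_w_dash in skip_accession_numbers:
            return False
        return True

    should_fetch('0000320193-23-000106',
                 skip_accession_numbers=['0000320193-23-000106'])  # False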
datamule/sec/xbrl/filter_xbrl.py CHANGED
@@ -5,8 +5,6 @@ from ..utils import headers
 def fetch_frame(taxonomy, concept, unit, period):
     url = f"https://data.sec.gov/api/xbrl/frames/{taxonomy}/{concept}/{unit}/{period}.json"
     response = requests.get(url, headers=headers)
-    print(url)
-    print(response)
     return response.json()
 
 
datamule/seclibrary/downloader.py CHANGED
@@ -14,7 +14,6 @@ from queue import Queue, Empty
 from threading import Thread
 from .query import query
 from os import cpu_count
-from ..submission import Submission
 from secsgml import write_sgml_file_to_tar
 
 
@@ -235,7 +234,8 @@ class Downloader:
         processor.stop_workers()
         decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
+                 skip_accession_numbers=[]):
         """
         Query SEC filings and download/process them.
 
@@ -259,10 +259,18 @@ class Downloader:
             filing_date=filing_date,
             api_key=self.api_key
         )
+
+
         # After querying but before generating URLs
         if accession_numbers:
+            accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
             filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]
 
+
+        if skip_accession_numbers:
+            skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
+            filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]
+
         # Generate URLs from the query results
 
         print(f"Generating URLs for {len(filings)} filings...")
@@ -355,7 +363,8 @@ class Downloader:
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
+             skip_accession_numbers=[]):
     """
     Query SEC filings and download/process them.
 
@@ -383,28 +392,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         accession_numbers=accession_numbers,
         keep_document_types=keep_document_types,
         keep_filtered_metadata=keep_filtered_metadata,
-        standardize_metadata=standardize_metadata
+        standardize_metadata=standardize_metadata,
+        skip_accession_numbers=skip_accession_numbers
     )
-
-def download_files_using_filename(filenames, api_key=None, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
-    """
-    Download and process SEC filings using specific filenames.
-
-    Parameters:
-    - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
-    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
-    - output_dir: Directory to save downloaded files
-    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-    - keep_filtered_metadata: Whether to keep metadata for filtered documents
-    - standardize_metadata: Whether to standardize metadata format
-    """
-    downloader = Downloader(api_key=api_key)
-    downloader.QUEUE_SIZE = 1
-    downloader.MAX_CONCURRENT_DOWNLOADS = 1
-    downloader.download_files_using_filename(
-        filenames=filenames,
-        output_dir=output_dir,
-        keep_document_types=keep_document_types,
-        keep_filtered_metadata=keep_filtered_metadata,
-        standardize_metadata=standardize_metadata
-    )
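Note how the two filter lists are normalized before comparison: the datamule provider's query results carry accession numbers without dashes, so both lists have their dashes stripped, with the allow-list cast to str and the deny-list to int as in the hunk above. A quick illustration of what that normalization does (the accession number is hypothetical):

    accno = '0000320193-23-000106'             # hypothetical dashed accession number
    as_str = str(int(accno.replace('-', '')))  # '32019323000106' (leading zeros drop)
    as_int = int(accno.replace('-', ''))       # 32019323000106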
datamule-1.5.8.dist-info/METADATA → datamule-1.5.9.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.8
+Version: 1.5.9
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
datamule-1.5.8.dist-info/RECORD → datamule-1.5.9.dist-info/RECORD RENAMED
@@ -3,7 +3,7 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=Ijx4JFRHSzPoGJRdOTv8c90x79M80LlAXUhUncwYZSo,7755
+datamule/portfolio.py,sha256=eF1eDSwIg-CI8ZmZAHRjCGU0UhuPN4ijxPB0YDT4s2o,8023
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
 datamule/submission.py,sha256=6JIi-ayLL-jENVj6Q4IhmrYlAreJI7xBAHP_NYaDB6k,12918
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
@@ -46,21 +46,21 @@ datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
 datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNsBw5Jv0Tx5aljiGUJkk7DRk,18745
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/sec/submissions/downloader.py,sha256=tDWn8bsK9XabQo2pBGYSiqTw37MmqM8rEma8Ph7zp-o,1391
+datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
 datamule/sec/submissions/monitor.py,sha256=ll0nfHzG8FI3bA8zVFrfsfZGnbt5qAD4rRZ4LG2SORY,9567
-datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
+datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
 datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
-datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
+datamule/sec/xbrl/filter_xbrl.py,sha256=QiSfm7tsJVLIw2PFqGh8D01qsRe_ZB-mbFhr6KcBa8A,1281
 datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H5N2tbvKUzM,3307
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=wNRURTGb3eqg12Ltt4578L0WcAm7DmCWg0Rm0Om6Z4U,17959
+datamule/seclibrary/downloader.py,sha256=ylv69VF22IVfrdeCkiGr5mVa2GKrPC9zFiDJU1fiBu8,17262
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.5.8.dist-info/METADATA,sha256=kfV8_aDjqzk6OZKmJn4GIffpvTW-SYi55O1qSOEnsGQ,501
-datamule-1.5.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-datamule-1.5.8.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-1.5.8.dist-info/RECORD,,
+datamule-1.5.9.dist-info/METADATA,sha256=DkoMbTIImVjWfEkqwfe7BBqCpkvBC8CFRRF5v7PKyco,501
+datamule-1.5.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.5.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.5.9.dist-info/RECORD,,