datamule 1.5.8__py3-none-any.whl → 1.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +10 -3
- datamule/sec/submissions/downloader.py +3 -1
- datamule/sec/submissions/streamer.py +7 -2
- datamule/sec/xbrl/filter_xbrl.py +0 -2
- datamule/seclibrary/downloader.py +14 -27
- {datamule-1.5.8.dist-info → datamule-1.5.9.dist-info}/METADATA +1 -1
- {datamule-1.5.8.dist-info → datamule-1.5.9.dist-info}/RECORD +9 -9
- {datamule-1.5.8.dist-info → datamule-1.5.9.dist-info}/WHEEL +0 -0
- {datamule-1.5.8.dist-info → datamule-1.5.9.dist-info}/top_level.txt +0 -0
datamule/portfolio.py
CHANGED
@@ -127,7 +127,7 @@ class Portfolio:
|
|
127
127
|
self.accession_numbers = new_accession_numbers
|
128
128
|
|
129
129
|
def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
|
130
|
-
requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True, **kwargs):
|
130
|
+
requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True,skip_existing=True, **kwargs):
|
131
131
|
if provider is None:
|
132
132
|
config = Config()
|
133
133
|
provider = config.get_default_source()
|
@@ -135,6 +135,11 @@ class Portfolio:
|
|
135
135
|
# Process CIK and metadata filters
|
136
136
|
cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
|
137
137
|
|
138
|
+
accession_numbers = self.accession_numbers if hasattr(self, 'accession_numbers') else None
|
139
|
+
skip_accession_numbers = []
|
140
|
+
if skip_existing:
|
141
|
+
skip_accession_numbers = [sub.accession for sub in self]
|
142
|
+
|
138
143
|
if provider == 'datamule':
|
139
144
|
|
140
145
|
seclibrary_download(
|
@@ -143,10 +148,11 @@ class Portfolio:
|
|
143
148
|
api_key=self.api_key,
|
144
149
|
submission_type=submission_type,
|
145
150
|
filing_date=filing_date,
|
146
|
-
accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
|
151
|
+
accession_numbers=accession_numbers,
|
147
152
|
keep_document_types=document_type,
|
148
153
|
keep_filtered_metadata=keep_filtered_metadata,
|
149
154
|
standardize_metadata=standardize_metadata,
|
155
|
+
skip_accession_numbers=skip_accession_numbers
|
150
156
|
)
|
151
157
|
else:
|
152
158
|
sec_download(
|
@@ -155,10 +161,11 @@ class Portfolio:
|
|
155
161
|
submission_type=submission_type,
|
156
162
|
filing_date=filing_date,
|
157
163
|
requests_per_second=requests_per_second,
|
158
|
-
accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
|
164
|
+
accession_numbers=accession_numbers,
|
159
165
|
keep_document_types=document_type,
|
160
166
|
keep_filtered_metadata=keep_filtered_metadata,
|
161
167
|
standardize_metadata=standardize_metadata,
|
168
|
+
skip_accession_numbers=skip_accession_numbers
|
162
169
|
)
|
163
170
|
|
164
171
|
self.submissions_loaded = False
|
datamule/sec/submissions/downloader.py
CHANGED
@@ -5,7 +5,8 @@ from tqdm import tqdm
|
|
5
5
|
|
6
6
|
def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
|
7
7
|
requests_per_second=5, output_dir="filings", accession_numbers=None,
|
8
|
-
quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
|
8
|
+
quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
|
9
|
+
skip_accession_numbers=[]):
|
9
10
|
# Make sure output directory exists
|
10
11
|
os.makedirs(output_dir, exist_ok=True)
|
11
12
|
|
@@ -29,5 +30,6 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
|
|
29
30
|
requests_per_second=requests_per_second,
|
30
31
|
document_callback=callback_wrapper,
|
31
32
|
accession_numbers=accession_numbers,
|
33
|
+
skip_accession_numbers=skip_accession_numbers,
|
32
34
|
quiet=quiet
|
33
35
|
)
|
datamule/sec/submissions/streamer.py
CHANGED
@@ -21,7 +21,7 @@ def fix_filing_url(url):
|
|
21
21
|
return url
|
22
22
|
|
23
23
|
class Streamer(EFTSQuery):
|
24
|
-
def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None, quiet=False):
|
24
|
+
def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None,skip_accession_numbers=None, quiet=False):
|
25
25
|
super().__init__(requests_per_second=requests_per_second, quiet=quiet)
|
26
26
|
self.document_callback = document_callback
|
27
27
|
self.document_queue = asyncio.Queue()
|
@@ -32,6 +32,7 @@ class Streamer(EFTSQuery):
|
|
32
32
|
self.documents_processed = 0
|
33
33
|
self.total_documents = 0
|
34
34
|
self.accession_numbers = accession_numbers
|
35
|
+
self.skip_accession_numbers = skip_accession_numbers
|
35
36
|
self.skipped_documents = 0
|
36
37
|
|
37
38
|
async def _fetch_worker(self):
|
@@ -81,6 +82,9 @@ class Streamer(EFTSQuery):
|
|
81
82
|
if self.accession_numbers is not None and accno_w_dash not in self.accession_numbers:
|
82
83
|
return None, None, None
|
83
84
|
|
85
|
+
if self.skip_accession_numbers is not None and accno_w_dash in self.skip_accession_numbers:
|
86
|
+
return None, None, None
|
87
|
+
|
84
88
|
# Construct the URL
|
85
89
|
url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accno_no_dash}/{accno_w_dash}.txt"
|
86
90
|
url = fix_filing_url(url)
|
@@ -218,7 +222,7 @@ class Streamer(EFTSQuery):
|
|
218
222
|
return results
|
219
223
|
|
220
224
|
def stream(cik=None, submission_type=None, filing_date=None, location=None,
|
221
|
-
requests_per_second=5.0, document_callback=None, accession_numbers=None,
|
225
|
+
requests_per_second=5.0, document_callback=None, accession_numbers=None,skip_accession_numbers=[],
|
222
226
|
quiet=False, name=None):
|
223
227
|
"""
|
224
228
|
Stream EFTS results and download documents into memory.
|
@@ -257,6 +261,7 @@ def stream(cik=None, submission_type=None, filing_date=None, location=None,
|
|
257
261
|
requests_per_second=requests_per_second,
|
258
262
|
document_callback=document_callback,
|
259
263
|
accession_numbers=accession_numbers,
|
264
|
+
skip_accession_numbers=skip_accession_numbers,
|
260
265
|
quiet=quiet
|
261
266
|
)
|
262
267
|
return await streamer.stream(cik, submission_type, filing_date, location, name)
|
datamule/sec/xbrl/filter_xbrl.py
CHANGED
@@ -5,8 +5,6 @@ from ..utils import headers
|
|
5
5
|
def fetch_frame(taxonomy, concept, unit, period):
|
6
6
|
url = f"https://data.sec.gov/api/xbrl/frames/{taxonomy}/{concept}/{unit}/{period}.json"
|
7
7
|
response = requests.get(url, headers=headers)
|
8
|
-
print(url)
|
9
|
-
print(response)
|
10
8
|
return response.json()
|
11
9
|
|
12
10
|
|
datamule/seclibrary/downloader.py
CHANGED
@@ -14,7 +14,6 @@ from queue import Queue, Empty
|
|
14
14
|
from threading import Thread
|
15
15
|
from .query import query
|
16
16
|
from os import cpu_count
|
17
|
-
from ..submission import Submission
|
18
17
|
from secsgml import write_sgml_file_to_tar
|
19
18
|
|
20
19
|
|
@@ -235,7 +234,8 @@ class Downloader:
|
|
235
234
|
processor.stop_workers()
|
236
235
|
decompression_pool.shutdown()
|
237
236
|
|
238
|
-
def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
|
237
|
+
def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
|
238
|
+
skip_accession_numbers=[]):
|
239
239
|
"""
|
240
240
|
Query SEC filings and download/process them.
|
241
241
|
|
@@ -259,10 +259,18 @@ class Downloader:
|
|
259
259
|
filing_date=filing_date,
|
260
260
|
api_key=self.api_key
|
261
261
|
)
|
262
|
+
|
263
|
+
|
262
264
|
# After querying but before generating URLs
|
263
265
|
if accession_numbers:
|
266
|
+
accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
|
264
267
|
filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]
|
265
268
|
|
269
|
+
|
270
|
+
if skip_accession_numbers:
|
271
|
+
skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
|
272
|
+
filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]
|
273
|
+
|
266
274
|
# Generate URLs from the query results
|
267
275
|
|
268
276
|
print(f"Generating URLs for {len(filings)} filings...")
|
@@ -355,7 +363,8 @@ class Downloader:
|
|
355
363
|
print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
|
356
364
|
|
357
365
|
|
358
|
-
def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
|
366
|
+
def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
|
367
|
+
skip_accession_numbers=[]):
|
359
368
|
"""
|
360
369
|
Query SEC filings and download/process them.
|
361
370
|
|
@@ -383,28 +392,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
|
|
383
392
|
accession_numbers=accession_numbers,
|
384
393
|
keep_document_types=keep_document_types,
|
385
394
|
keep_filtered_metadata=keep_filtered_metadata,
|
386
|
-
standardize_metadata=standardize_metadata
|
395
|
+
standardize_metadata=standardize_metadata,
|
396
|
+
skip_accession_numbers=skip_accession_numbers
|
387
397
|
)
|
388
|
-
|
389
|
-
def download_files_using_filename(filenames, api_key=None, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
|
390
|
-
"""
|
391
|
-
Download and process SEC filings using specific filenames.
|
392
|
-
|
393
|
-
Parameters:
|
394
|
-
- filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
|
395
|
-
- api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
|
396
|
-
- output_dir: Directory to save downloaded files
|
397
|
-
- keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
|
398
|
-
- keep_filtered_metadata: Whether to keep metadata for filtered documents
|
399
|
-
- standardize_metadata: Whether to standardize metadata format
|
400
|
-
"""
|
401
|
-
downloader = Downloader(api_key=api_key)
|
402
|
-
downloader.QUEUE_SIZE = 1
|
403
|
-
downloader.MAX_CONCURRENT_DOWNLOADS = 1
|
404
|
-
downloader.download_files_using_filename(
|
405
|
-
filenames=filenames,
|
406
|
-
output_dir=output_dir,
|
407
|
-
keep_document_types=keep_document_types,
|
408
|
-
keep_filtered_metadata=keep_filtered_metadata,
|
409
|
-
standardize_metadata=standardize_metadata
|
410
|
-
)
|
{datamule-1.5.8.dist-info → datamule-1.5.9.dist-info}/RECORD
CHANGED
@@ -3,7 +3,7 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
|
|
3
3
|
datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
|
4
4
|
datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
|
5
5
|
datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
|
6
|
-
datamule/portfolio.py,sha256=
|
6
|
+
datamule/portfolio.py,sha256=eF1eDSwIg-CI8ZmZAHRjCGU0UhuPN4ijxPB0YDT4s2o,8023
|
7
7
|
datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
|
8
8
|
datamule/submission.py,sha256=6JIi-ayLL-jENVj6Q4IhmrYlAreJI7xBAHP_NYaDB6k,12918
|
9
9
|
datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
|
@@ -46,21 +46,21 @@ datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
|
|
46
46
|
datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
47
47
|
datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNsBw5Jv0Tx5aljiGUJkk7DRk,18745
|
48
48
|
datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
49
|
-
datamule/sec/submissions/downloader.py,sha256=
|
49
|
+
datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
|
50
50
|
datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
|
51
51
|
datamule/sec/submissions/monitor.py,sha256=ll0nfHzG8FI3bA8zVFrfsfZGnbt5qAD4rRZ4LG2SORY,9567
|
52
|
-
datamule/sec/submissions/streamer.py,sha256=
|
52
|
+
datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
|
53
53
|
datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
|
54
54
|
datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
55
55
|
datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
|
56
|
-
datamule/sec/xbrl/filter_xbrl.py,sha256=
|
56
|
+
datamule/sec/xbrl/filter_xbrl.py,sha256=QiSfm7tsJVLIw2PFqGh8D01qsRe_ZB-mbFhr6KcBa8A,1281
|
57
57
|
datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H5N2tbvKUzM,3307
|
58
58
|
datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
|
59
59
|
datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
60
60
|
datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
|
61
|
-
datamule/seclibrary/downloader.py,sha256=
|
61
|
+
datamule/seclibrary/downloader.py,sha256=ylv69VF22IVfrdeCkiGr5mVa2GKrPC9zFiDJU1fiBu8,17262
|
62
62
|
datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
|
63
|
-
datamule-1.5.
|
64
|
-
datamule-1.5.
|
65
|
-
datamule-1.5.
|
66
|
-
datamule-1.5.
|
63
|
+
datamule-1.5.9.dist-info/METADATA,sha256=DkoMbTIImVjWfEkqwfe7BBqCpkvBC8CFRRF5v7PKyco,501
|
64
|
+
datamule-1.5.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
65
|
+
datamule-1.5.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
66
|
+
datamule-1.5.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|