datamule 1.5.3__py3-none-any.whl → 1.5.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- datamule/sec/submissions/monitor.py +45 -25
- datamule/seclibrary/downloader.py +82 -0
- datamule/submission.py +8 -0
- {datamule-1.5.3.dist-info → datamule-1.5.5.dist-info}/METADATA +1 -1
- {datamule-1.5.3.dist-info → datamule-1.5.5.dist-info}/RECORD +7 -7
- {datamule-1.5.3.dist-info → datamule-1.5.5.dist-info}/WHEEL +0 -0
- {datamule-1.5.3.dist-info → datamule-1.5.5.dist-info}/top_level.txt +0 -0
datamule/sec/submissions/monitor.py
CHANGED

@@ -7,7 +7,7 @@ import asyncio
 from ..utils import headers, PreciseRateLimiter
 from .eftsquery import EFTSQuery
 import aiohttp
-
+from zoneinfo import ZoneInfo
 
 async def poll_rss(limiter):
     base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
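The only functional change in this hunk is the new `zoneinfo` import; the hunks below use it to pin "today" to the SEC's US/Eastern clock instead of the machine's local timezone. A minimal standard-library sketch of why that matters:

```python
from datetime import datetime
from zoneinfo import ZoneInfo  # stdlib since Python 3.9

# On a UTC machine the naive date rolls over at 00:00 UTC, hours before
# the SEC's Eastern-time filing day ends, so the two can disagree.
naive_today = datetime.now().strftime('%Y-%m-%d')
eastern_today = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
print(naive_today, eastern_today)  # may differ between ~20:00 ET and midnight ET
```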
@@ -77,15 +77,22 @@ class Monitor():
         )
 
     async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
-
-
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
         """
         Async implementation of monitor_submissions.
+        Either polling_interval or validation_interval (or both) must be specified.
+        If polling_interval is None, only EFTS validation will be performed.
+        If validation_interval is None, only RSS polling will be performed.
         """
 
+        # Validate that at least one interval is specified
+        if polling_interval is None and validation_interval is None:
+            raise ValueError("At least one of polling_interval or validation_interval must be specified")
+
         # Backfill if start_date is provided
         if start_date is not None:
-            today_date = datetime.now().strftime('%Y-%m-%d')
+            today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
            if not quiet:
                 print(f"Backfilling from {start_date} to {today_date}")
 
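The reworked signature makes both intervals optional while requiring at least one. Assuming the public `monitor_submissions` wrapper forwards the same keyword arguments (its continuation lines are truncated in this diff) and that `Monitor()` needs no required constructor arguments, usage would look like:

```python
from datamule.sec.submissions.monitor import Monitor

def on_new(hits):
    print(f"{len(hits)} new submissions")

monitor = Monitor()

# Defaults: poll RSS every 1000 ms and validate via EFTS every 60000 ms.
# Pass validation_interval=None for RSS-only, or polling_interval=None for
# EFTS-only; setting both to None raises ValueError.
monitor.monitor_submissions(data_callback=on_new,
                            polling_interval=1000,
                            validation_interval=60000)
```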
@@ -100,26 +107,35 @@ class Monitor():
             if new_hits and data_callback:
                 data_callback(new_hits)
 
-
-
-
-
+        # Initialize timing variables
+        current_time = time.time()
+        last_polling_time = current_time
+        last_validation_time = current_time
+
+        # Determine which operations to perform
+        do_polling = polling_interval is not None
+        do_validation = validation_interval is not None
+
         while True:
-
-
-
-
-            new_results = self._filter_new_accessions(results)
-            if new_results:
+            current_time = time.time()
+
+            # RSS polling (if enabled)
+            if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
                 if not quiet:
-                    print(f"
-
-
+                    print(f"Polling RSS feed")
+                results = await poll_rss(self.ratelimiters['sec.gov'])
+                new_results = self._filter_new_accessions(results)
+                if new_results:
+                    if not quiet:
+                        print(f"Found {len(new_results)} new submissions via RSS")
+                    if data_callback:
+                        data_callback(new_results)
+                last_polling_time = current_time
 
-            # EFTS validation
-            if
+            # EFTS validation (if enabled)
+            if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
                 # Get submissions from the last 24 hours for validation
-                today_date = datetime.now().strftime('%Y-%m-%d')
+                today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
                 if not quiet:
                     print(f"Validating submissions from {today_date}")
 
@@ -134,19 +150,23 @@ class Monitor():
                         print(f"Found {len(new_hits)} new submissions via EFTS validation")
                     if data_callback:
                         data_callback(new_hits)
-                last_polling_time = time.time()
                 last_validation_time = current_time
 
             # Interval callback
             if interval_callback:
                 interval_callback()
 
-
+            # Calculate next wake-up time
+            next_times = []
+            if do_polling:
+                next_times.append(last_polling_time + (polling_interval / 1000))
+            if do_validation:
+                next_times.append(last_validation_time + (validation_interval / 1000))
+
+            next_wake_time = min(next_times)
             current_time = time.time()
-            time_to_sleep = max(0,
+            time_to_sleep = max(0, next_wake_time - current_time)
             await asyncio.sleep(time_to_sleep)
-            last_polling_time = next_poll_time
-
 
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
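Stripped of the SEC-specific calls, the new loop is a min-of-next-deadlines scheduler: each enabled task fires when its own interval has elapsed, updates only its own timestamp (note the removal of the stray `last_polling_time` update in the validation branch), and the loop sleeps until the earliest upcoming deadline instead of busy-waiting. A self-contained sketch of the pattern (names and task bodies are placeholders, not the library's API):

```python
import asyncio
import time

async def dual_interval_loop(poll_ms=1000, validate_ms=60000):
    """Run two periodic tasks on independent intervals in one loop.
    A None interval disables that task, mirroring the monitor's logic."""
    now = time.time()
    last_poll = last_validate = now
    while True:
        now = time.time()
        if poll_ms is not None and (now - last_poll) >= poll_ms / 1000:
            print("poll tick")        # placeholder for poll_rss(...)
            last_poll = now
        if validate_ms is not None and (now - last_validate) >= validate_ms / 1000:
            print("validate tick")    # placeholder for the EFTS query
            last_validate = now
        # Sleep until the earliest next deadline instead of busy-waiting.
        deadlines = []
        if poll_ms is not None:
            deadlines.append(last_poll + poll_ms / 1000)
        if validate_ms is not None:
            deadlines.append(last_validate + validate_ms / 1000)
        await asyncio.sleep(max(0, min(deadlines) - time.time()))

# asyncio.run(dual_interval_loop())  # runs until interrupted
```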
datamule/seclibrary/downloader.py
CHANGED

@@ -98,6 +98,7 @@ class Downloader:
             filename, content = item
             output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
             write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
+
             self.pbar.update(1)
 
     def _processing_worker(self):
@@ -296,6 +297,64 @@ class Downloader:
         self.loop.call_soon_threadsafe(self.loop.stop)
 
 
+
+    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+        """
+        Download and process SEC filings using specific filenames.
+
+        Parameters:
+        - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+        - output_dir: Directory to save downloaded files
+        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+        - keep_filtered_metadata: Whether to keep metadata for filtered documents
+        - standardize_metadata: Whether to standardize metadata format
+        """
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        if not filenames:
+            raise ValueError("No filenames provided")
+
+        if not isinstance(filenames, (list, tuple)):
+            filenames = [filenames]
+
+        # Validate filenames format
+        for filename in filenames:
+            if not isinstance(filename, str):
+                raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
+            if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
+                raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
+
+        # Generate URLs directly from filenames
+        print(f"Generating URLs for {len(filenames)} files...")
+        urls = []
+        for filename in filenames:
+            url = f"{self.BASE_URL}{filename}"
+            urls.append(url)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        urls = [url for url in urls if not (url in seen or seen.add(url))]
+
+        print(f"Downloading {len(urls)} files...")
+
+        # Process the batch asynchronously using existing infrastructure
+        start_time = time.time()
+
+        asyncio.run(self.process_batch(
+            urls,
+            output_dir,
+            keep_document_types=keep_document_types,
+            keep_filtered_metadata=keep_filtered_metadata,
+            standardize_metadata=standardize_metadata
+        ))
+
+        # Calculate and display performance metrics
+        elapsed_time = time.time() - start_time
+        print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+        print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+
 def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
     """
     Query SEC filings and download/process them.
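One detail of the new method worth calling out: duplicates are dropped with an order-preserving one-liner that exploits `set.add()` returning `None` (falsy), so the membership test and the insertion happen in a single pass over the list:

```python
# Order-preserving dedup, as used in download_files_using_filename.
urls = ['a.sgml', 'b.sgml', 'a.sgml']
seen = set()
unique = [u for u in urls if not (u in seen or seen.add(u))]
assert unique == ['a.sgml', 'b.sgml']  # first occurrence wins, order kept
```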
@@ -325,4 +384,27 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         keep_document_types=keep_document_types,
         keep_filtered_metadata=keep_filtered_metadata,
         standardize_metadata=standardize_metadata
+    )
+
+def download_files_using_filename(filenames, api_key=None, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+    """
+    Download and process SEC filings using specific filenames.
+
+    Parameters:
+    - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+    - output_dir: Directory to save downloaded files
+    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+    - keep_filtered_metadata: Whether to keep metadata for filtered documents
+    - standardize_metadata: Whether to standardize metadata format
+    """
+    downloader = Downloader(api_key=api_key)
+    downloader.QUEUE_SIZE = 1
+    downloader.MAX_CONCURRENT_DOWNLOADS = 1
+    downloader.download_files_using_filename(
+        filenames=filenames,
+        output_dir=output_dir,
+        keep_document_types=keep_document_types,
+        keep_filtered_metadata=keep_filtered_metadata,
+        standardize_metadata=standardize_metadata
     )
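The module-level wrapper pins `QUEUE_SIZE` and `MAX_CONCURRENT_DOWNLOADS` to 1, so filename-based downloads run serially rather than through the batched pipeline. A usage sketch (filenames taken from the docstring; requires `DATAMULE_API_KEY` or an explicit `api_key`):

```python
from datamule.seclibrary.downloader import download_files_using_filename

download_files_using_filename(
    filenames=['000091205797006494.sgml', '000100704297000007.sgml.zst'],
    output_dir='downloads',        # each filing is written as <accession>.tar
    keep_document_types=['10-K'],  # optional: drop other document types
)
```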
datamule/submission.py
CHANGED

@@ -3,6 +3,7 @@ import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
 from secsgml.utils import bytes_to_str
+from secsgml.parse_sgml import transform_metadata_string
 import tarfile
 import shutil
 import zstandard as zstd
@@ -86,6 +87,10 @@ class Submission:
         if sgml_content is not None:
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
+
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
             # code dupe
             self.accession = self.metadata.content['accession-number']
@@ -123,6 +128,9 @@ class Submission:
             metadata_path = self.path / 'metadata.json'
             with metadata_path.open('r') as f:
                 metadata = json.load(f)
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
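Both construction paths (in-memory SGML and `metadata.json` on disk) now pass the parsed metadata through `transform_metadata_string` before wrapping it in a `Document`, so fields such as `accession-number` and `filing-date` arrive in a predictable shape. The `filing_date` slicing in the last context line assumes the standardized `filing-date` is an 8-character YYYYMMDD string:

```python
# A sketch of the reshaping done by the filing_date f-string;
# the input value is an assumed example, not taken from a real filing.
filing_date = '19970513'  # standardized 'filing-date' (assumed YYYYMMDD)
formatted = f"{filing_date[:4]}-{filing_date[4:6]}-{filing_date[6:8]}"
assert formatted == '1997-05-13'
```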
{datamule-1.5.3.dist-info → datamule-1.5.5.dist-info}/RECORD
CHANGED

@@ -5,7 +5,7 @@ datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
 datamule/portfolio.py,sha256=iW54frGfoCQb-6aYfocDqQQPe0gc_22voedv0It_1q0,7517
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=6JIi-ayLL-jENVj6Q4IhmrYlAreJI7xBAHP_NYaDB6k,12918
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/document.py,sha256=04Rivdphq0D1HEGIBjtl1LelJr-IyQU1qCMi8yNJajw,14038
@@ -46,7 +46,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/submissions/downloader.py,sha256=tDWn8bsK9XabQo2pBGYSiqTw37MmqM8rEma8Ph7zp-o,1391
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
-datamule/sec/submissions/monitor.py,sha256
+datamule/sec/submissions/monitor.py,sha256=-WewsDyj53bOAgBY3iEOB2PYsOtVcSAFCtiNZ-eV-Nw,9064
 datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
 datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -56,9 +56,9 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=wNRURTGb3eqg12Ltt4578L0WcAm7DmCWg0Rm0Om6Z4U,17959
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.5.
-datamule-1.5.
-datamule-1.5.
-datamule-1.5.
+datamule-1.5.5.dist-info/METADATA,sha256=lYQT8OyN09OciItYc_Dtxklt4a11n2QyXzSEyzOmR6Q,469
+datamule-1.5.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.5.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.5.5.dist-info/RECORD,,

{datamule-1.5.3.dist-info → datamule-1.5.5.dist-info}/WHEEL
File without changes

{datamule-1.5.3.dist-info → datamule-1.5.5.dist-info}/top_level.txt
File without changes