datamule 1.1.0__py3-none-any.whl → 1.1.5__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- datamule/__init__.py +5 -1
- datamule/index.py +62 -0
- datamule/sec/submissions/eftsquery.py +37 -22
- datamule/sec/submissions/textsearch.py +10 -6
- datamule/{book/book.py → sheet.py} +3 -3
- {datamule-1.1.0.dist-info → datamule-1.1.5.dist-info}/METADATA +1 -1
- {datamule-1.1.0.dist-info → datamule-1.1.5.dist-info}/RECORD +9 -9
- datamule/book/__init__.py +0 -0
- {datamule-1.1.0.dist-info → datamule-1.1.5.dist-info}/WHEEL +0 -0
- {datamule-1.1.0.dist-info → datamule-1.1.5.dist-info}/top_level.txt +0 -0
datamule/__init__.py
CHANGED
@@ -3,6 +3,8 @@ from .portfolio import Portfolio
 from .document import Document
 from .helper import _load_package_csv, load_package_dataset
 from .config import Config
+from .sheet import Sheet
+from .index import Index
 
 
 # Keep the notebook environment setup
@@ -32,5 +34,7 @@ __all__ = [
     'Portfolio',
     'Submission',
     'Document',
-    'Config'
+    'Config',
+    'Sheet',
+    'Index',
 ]
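In 1.1.5 the Sheet and Index classes join the top-level exports. A minimal sketch of the new surface, assuming the wheel is installed (constructor defaults taken from sheet.py and index.py below):

from datamule import Sheet, Index

index = Index()        # path is optional; results are returned but not saved
sheet = Sheet("data")  # Sheet requires a storage path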
datamule/index.py
ADDED
@@ -0,0 +1,62 @@
+from pathlib import Path
+from .sec.submissions.textsearch import query
+from .helper import _process_cik_and_metadata_filters, load_package_dataset
+
+class Index:
+    def __init__(self, path=None):
+        self.path = Path(path) if path else None
+
+    def search_submissions(
+        self,
+        text_query,
+        start_date=None,
+        end_date=None,
+        submission_type=None,
+        cik=None,
+        ticker=None,
+        requests_per_second=5.0,
+        quiet=True,
+        **kwargs
+    ):
+        """
+        Search SEC filings for the given text query.
+
+        Args:
+            text_query (str): Text to search for in SEC filings.
+            start_date (str or date, optional): Start date for filing search.
+            end_date (str or date, optional): End date for filing search.
+            submission_type (str, optional): Type of SEC submission to search.
+            cik (str, int, or list, optional): CIK(s) to filter by.
+            ticker (str or list, optional): Ticker(s) to filter by.
+            requests_per_second (float, optional): Rate limit for SEC API requests.
+            quiet (bool, optional): Whether to suppress output.
+            **kwargs: Additional filters to apply.
+
+        Returns:
+            dict: Search results from the query function.
+        """
+        # Process CIK and ticker filters if provided
+        if cik is not None or ticker is not None:
+            cik_list = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+            # Add CIK filter to the query if we have results
+            if cik_list:
+                # Implementation note: Update as needed - this assumes your query function
+                # can accept a cik parameter, otherwise you may need additional logic here
+                kwargs['cik'] = cik_list
+
+        # Execute the search query
+        results = query(
+            f'{text_query}',
+            filing_date=(start_date, end_date),
+            requests_per_second=requests_per_second,
+            quiet=quiet,
+            submission_type=submission_type,
+            **kwargs
+        )
+
+        # Save results to path if specified
+        if self.path:
+            self._save_results(results, text_query)
+
+        return results
+
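A hedged usage sketch based on the signature and docstring above; the query phrase and dates are illustrative. Note that when a path is set, search_submissions calls self._save_results, which is not defined anywhere in the 62 added lines, so constructing Index without a path avoids that branch:

from datamule import Index

index = Index()  # no path, so the undefined _save_results branch is never reached
results = index.search_submissions(
    text_query='"climate risk"',   # illustrative phrase
    submission_type="10-K",
    start_date="2023-01-01",
    end_date="2023-12-31",
    quiet=True,                    # default; suppresses prints
)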
datamule/sec/submissions/eftsquery.py
CHANGED
@@ -6,13 +6,14 @@ from tqdm import tqdm
 from ..utils import RetryException, PreciseRateLimiter, RateMonitor, headers
 
 class EFTSQuery:
-    def __init__(self, requests_per_second=5.0):
+    def __init__(self, requests_per_second=5.0, quiet=False):
         self.base_url = "https://efts.sec.gov/LATEST/search-index"
         self.headers = headers
         self.limiter = PreciseRateLimiter(requests_per_second)
         self.rate_monitor = RateMonitor()
         self.session = None
         self.pbar = None
+        self.quiet = quiet
         self.max_page_size = 100  # EFTS API limit
         self.fetch_queue = asyncio.Queue()
         self.connection_semaphore = asyncio.Semaphore(5)  # Max 5 concurrent connections
@@ -127,6 +128,8 @@ class EFTSQuery:
         return ", ".join(parts)
 
     async def _fetch_json(self, url):
+        if not self.quiet:
+            print(f"Fetching {url}...")
         async with self.connection_semaphore:
             async with self.limiter:
                 try:
@@ -160,18 +163,21 @@
                     await callback(hits)
                     self.fetch_queue.task_done()
                 except RetryException as e:
-                    print(f"\nRate limited. Sleeping for {e.retry_after} seconds...")
+                    if not self.quiet:
+                        print(f"\nRate limited. Sleeping for {e.retry_after} seconds...")
                     await asyncio.sleep(e.retry_after)
                     # Put back in queue
                     await self.fetch_queue.put((params, from_val, size_val, callback))
                     self.fetch_queue.task_done()
                 except Exception as e:
-                    print(f"\nError fetching {url}: {str(e)}")
+                    if not self.quiet:
+                        print(f"\nError fetching {url}: {str(e)}")
                     self.fetch_queue.task_done()
             except asyncio.CancelledError:
                 break
             except Exception as e:
-                print(f"\nWorker error: {str(e)}")
+                if not self.quiet:
+                    print(f"\nWorker error: {str(e)}")
                 self.fetch_queue.task_done()
 
     def _split_date_range(self, start_date, end_date, num_splits=4):
@@ -322,12 +328,14 @@
 
         # Skip if no results
         if total_hits == 0:
-            print(f"Skipping negated forms query - no results returned")
+            if not self.quiet:
+                print(f"Skipping negated forms query - no results returned")
             return
 
-        query_desc = self._get_query_description(params)
-        date_range = f"{start_date} to {end_date}"
-        print(f"Planning: Analyzing negated forms query (depth {depth}): {date_range} [{total_hits:,} hits]")
+        if not self.quiet:
+            query_desc = self._get_query_description(params)
+            date_range = f"{start_date} to {end_date}"
+            print(f"Planning: Analyzing negated forms query (depth {depth}): {date_range} [{total_hits:,} hits]")
 
         # If small enough or at max depth, process directly
         if total_hits < self.max_efts_hits or start_date == end_date:
@@ -350,8 +358,9 @@
 
         total_hits, data = await self._test_query_size(params)
 
-        query_desc = self._get_query_description(params)
-        print(f"Planning: Analyzing {' '*depth}query: {query_desc} [{total_hits:,} hits]")
+        if not self.quiet:
+            query_desc = self._get_query_description(params)
+            print(f"Planning: Analyzing {' '*depth}query: {query_desc} [{total_hits:,} hits]")
 
         # If we're at the maximum recursion depth or hits are under limit, process directly
         if depth >= max_depth or total_hits < self.max_efts_hits:
@@ -396,8 +405,9 @@
 
     async def _start_query_phase(self, callback):
         """Start the query phase after planning is complete"""
-        print("\n--- Starting query phase ---")
-        self.pbar = tqdm(total=self.total_results_to_fetch, desc="Querying documents [Rate: 0/s | 0 MB/s]")
+        if not self.quiet:
+            print("\n--- Starting query phase ---")
+        self.pbar = tqdm(total=self.total_results_to_fetch, desc="Querying documents [Rate: 0/s | 0 MB/s]")
 
         # Queue all pending page requests
         for params, from_val, size_val, callback in self.pending_page_requests:
@@ -425,18 +435,21 @@
             self.pbar = None
 
         # First check size
-        print("\n--- Starting query planning phase ---")
-        print("Analyzing request and splitting into manageable chunks...")
+        if not self.quiet:
+            print("\n--- Starting query planning phase ---")
+            print("Analyzing request and splitting into manageable chunks...")
 
         total_hits, data = await self._test_query_size(params)
 
         if total_hits == 0:
-            print("No results found for this query.")
+            if not self.quiet:
+                print("No results found for this query.")
             return []
 
         # Get accurate total from aggregation buckets
         self.true_total_docs = self._get_total_from_buckets(data)
-        print(f"Found {self.true_total_docs:,} total documents to retrieve.")
+        if not self.quiet:
+            print(f"Found {self.true_total_docs:,} total documents to retrieve.")
 
         # Start worker tasks
         workers = [asyncio.create_task(self._fetch_worker()) for _ in range(5)]
@@ -458,7 +471,8 @@
                 negated_forms.append('-0')  # Keep primary documents constraint
 
                 remaining_docs = self.true_total_docs - self.processed_doc_count
-                print(f"Planning: Analyzing remaining primary document forms using negation (~{remaining_docs:,} hits)")
+                if not self.quiet:
+                    print(f"Planning: Analyzing remaining primary document forms using negation (~{remaining_docs:,} hits)")
 
                 # Process negated forms query with recursive date splitting
                 start_date = params['startdt']
@@ -466,9 +480,9 @@
                 await self._process_negated_forms_recursive(
                     params, negated_forms, start_date, end_date, 0, collect_hits
                 )
-            else:
+            elif not self.quiet:
                print("No additional forms to process with negation - not a primary documents query")
-        else:
+        elif not self.quiet:
            print("No additional forms to process with negation")
 
        # Start the download phase
@@ -488,15 +502,16 @@
             self.pbar.close()
             self.pbar = None
 
-        print(f"\n--- Query complete: {len(all_hits):,} submissions retrieved ---")
+        if not self.quiet:
+            print(f"\n--- Query complete: {len(all_hits):,} submissions retrieved ---")
         return all_hits
 
-def query_efts(cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, callback=None):
+def query_efts(cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, callback=None, quiet=False):
     """
     Convenience function to run a query without managing the async context.
     """
     async def run_query():
-        query = EFTSQuery(requests_per_second=requests_per_second)
+        query = EFTSQuery(requests_per_second=requests_per_second, quiet=quiet)
         return await query.query(cik, submission_type, filing_date, callback)
 
     return asyncio.run(run_query())
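The net effect of these hunks: a quiet flag is threaded from query_efts down through EFTSQuery, guarding every print statement. A sketch of the convenience function with the new parameter (the CIK, form type, and dates are illustrative):

from datamule.sec.submissions.eftsquery import query_efts

hits = query_efts(
    cik="320193",                              # illustrative CIK
    submission_type="8-K",
    filing_date=("2024-01-01", "2024-06-30"),  # (start, end) tuple, as Index passes it
    quiet=True,                                # new in 1.1.5
)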
datamule/sec/submissions/textsearch.py
CHANGED
@@ -9,8 +9,8 @@ class TextSearchEFTSQuery(EFTSQuery):
     """
     Extended EFTSQuery class that adds text search capabilities.
    """
-    def __init__(self, text_query, requests_per_second=5.0):
-        super().__init__(requests_per_second=requests_per_second)
+    def __init__(self, text_query, requests_per_second=5.0, quiet=False):
+        super().__init__(requests_per_second=requests_per_second, quiet=quiet)
         self.text_query = text_query
 
     def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
@@ -46,7 +46,7 @@ async def extract_accession_numbers(hits):
             accession_numbers.append(acc_no)
     return accession_numbers
 
-def query(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0):
+def query(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, quiet=False):
     """
     Search SEC filings for text and return the full search results.
 
@@ -66,6 +66,8 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
     requests_per_second : float, optional
         Maximum number of requests per second to make to the SEC API.
         Default is 5.0.
+    quiet : bool, optional
+        If True, suppresses all output (progress bars and prints). Default is False.
 
     Returns:
     --------
@@ -73,12 +75,12 @@
         Complete search results with all hit data.
     """
     async def run_query():
-        query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second)
+        query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
         return await query.query(cik, submission_type, filing_date)
 
     return asyncio.run(run_query())
 
-def filter_text(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0):
+def filter_text(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, quiet=False):
     """
     Search SEC filings for text and return matching accession numbers.
 
@@ -98,6 +100,8 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, requests
     requests_per_second : float, optional
         Maximum number of requests per second to make to the SEC API.
         Default is 5.0.
+    quiet : bool, optional
+        If True, suppresses all output (progress bars and prints). Default is False.
 
     Returns:
     --------
@@ -105,7 +109,7 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
         List of accession numbers (as strings) for filings that match the text query.
     """
     async def run_query():
-        query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second)
+        query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
 
         # Create a collector for accession numbers
         all_acc_nos = []
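Both public entry points gain the same pass-through flag, documented as suppressing progress bars and prints. A sketch under the documented signatures (the search phrase and dates are illustrative):

from datamule.sec.submissions.textsearch import query, filter_text

# Full hit data for filings matching a phrase
results = query('"going concern"', submission_type="10-K", quiet=True)

# Accession numbers only, over a date range
acc_nos = filter_text(
    '"going concern"',
    filing_date=("2024-01-01", "2024-03-31"),
    quiet=True,
)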
datamule/{book/book.py → sheet.py}
RENAMED
@@ -1,8 +1,8 @@
 from pathlib import Path
-from …
-from …
+from .helper import _process_cik_and_metadata_filters, load_package_dataset
+from .sec.xbrl.downloadcompanyfacts import download_company_facts
 
-class …
+class Sheet:
     def __init__(self, path):
         self.path = Path(path)
 
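Only the constructor survives in this hunk, so little of Sheet's API is visible; the download_company_facts import suggests it wraps XBRL company-facts downloads, but no public method is shown here. A minimal construction sketch:

from datamule import Sheet

sheet = Sheet("company_facts")  # path where downloaded data would be stored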
{datamule-1.1.0.dist-info → datamule-1.1.5.dist-info}/RECORD
RENAMED
@@ -1,11 +1,11 @@
-datamule/__init__.py,sha256=…
+datamule/__init__.py,sha256=l6YlwT5EeRxPlCtO5Jd4I8l266rSRUJyfFe97cRtSCM,991
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/document.py,sha256=BC8jdVy9pMOA9ghIqV5N2XJidmVNThqbBohsuSAnVoY,10813
 datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
+datamule/index.py,sha256=0txvbzPcvY1GsdxA-wGdLzAByxSeE_1VyyBp9mZEQRM,2292
 datamule/portfolio.py,sha256=JmZlTrom_g7FXKXxWp_CiQTyC7p6_cDP08G0kFUja48,6982
+datamule/sheet.py,sha256=WwumRdniClGU7W3AXVLOpCdMnepLC7KMrRpQlA6_NUY,1022
 datamule/submission.py,sha256=JsxYlEz1Ywu6eC32OS15p4p-p8qB6SWd_rXuf2p5UfY,1247
-datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/book/book.py,sha256=QWiowVNqb84o-JcVo0fpKumxnIbBge2ZeKwHxqkVMqw,1023
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
@@ -17,10 +17,10 @@ datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/submissions/downloader.py,sha256=HxbSkNotLLW6ROmU30rnXPlCo9gY3SoB1Z4ZWvj9FIY,2669
-datamule/sec/submissions/eftsquery.py,sha256=…
+datamule/sec/submissions/eftsquery.py,sha256=v6YMBZzksqweqHnNIllMFN-frWypAgvZPKx2FH1UrL4,22515
 datamule/sec/submissions/monitor.py,sha256=XkwH5nvzr_dNttmFRQ52m7344IKbOtWDfOZIEdie4H8,5234
 datamule/sec/submissions/streamer.py,sha256=hc61le7gGIIWp1KEaOv_PhriUxf7YYFkQrSKELlZ3pg,9748
-datamule/sec/submissions/textsearch.py,sha256=…
+datamule/sec/submissions/textsearch.py,sha256=oEIUrcO3HW-4dcyPCiOTvM7UUimNEM4HNIb-Juvc1BQ,4642
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
 datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
@@ -29,7 +29,7 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/downloader.py,sha256=Zb1TxsIz887tO3MJVP66siYVtNus89ti-g9oZ6VywrM,11500
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.1.0.dist-info/METADATA,sha256=…
-datamule-1.1.0.dist-info/WHEEL,sha256=…
-datamule-1.1.0.dist-info/top_level.txt,sha256=…
-datamule-1.1.0.dist-info/RECORD,,
+datamule-1.1.5.dist-info/METADATA,sha256=9Q8YzsBipVuGYN4eWmH49sF5oyouyZvVdJ6rncDa0VE,512
+datamule-1.1.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+datamule-1.1.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.1.5.dist-info/RECORD,,
datamule/book/__init__.py
DELETED
File without changes

{datamule-1.1.0.dist-info → datamule-1.1.5.dist-info}/WHEEL
RENAMED
File without changes

{datamule-1.1.0.dist-info → datamule-1.1.5.dist-info}/top_level.txt
RENAMED
File without changes