datamule 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document.py +29 -44
- datamule/portfolio.py +6 -2
- datamule/sec/submissions/downloader.py +19 -2
- datamule/sec/submissions/eftsquery.py +129 -8
- datamule/sec/submissions/monitor.py +3 -3
- datamule/sec/submissions/streamer.py +59 -23
- datamule/sec/submissions/textsearch.py +33 -6
- datamule/sheet.py +8 -1
- datamule/submission.py +93 -19
- {datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/METADATA +1 -1
- {datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/RECORD +13 -13
- {datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/WHEEL +0 -0
- {datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/top_level.txt +0 -0
datamule/document.py
CHANGED
@@ -8,31 +8,34 @@ from .mapping_dicts.xml_mapping_dicts import dict_345
 from selectolax.parser import HTMLParser
 
 class Document:
-    def __init__(self, type,
+    def __init__(self, type, content, extension):
+
         self.type = type
-
+        # we will remove this later #
+        # make sure extension is in lower case
+        extension = extension.lower()
+        self.content = content
+        if extension == '.txt':
+            self.content = self._preprocess_txt_content()
+        elif extension in ['.htm', '.html']:
+            self.content = self._preprocess_html_content()
 
+        self.extension = extension
+        # this will be filled by parsed
         self.data = None
-        self.content = None
-
 
-
-
-        self.content
-
-    def _load_text_content(self):
-        with open(self.path) as f:
-            return f.read().translate(str.maketrans({
+    #_load_text_content
+    def _preprocess_txt_content(self):
+        return self.content.read().translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
         }))
 
     # will deprecate this when we add html2dict
-    def
-
-
-
+    def _preprocess_html_content(self):
+        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
+
         # Remove hidden elements first
         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
         for node in hidden_nodes:
@@ -83,20 +86,9 @@ class Document:
             '\u201c': '"', '\u201d': '"'
         }))
 
-    def _load_file_content(self):
-        if self.path.suffix =='.txt':
-            self.content = self._load_text_content()
-        elif self.path.suffix in ['.html','.htm']:
-            self.content = self._load_html_content()
-        else:
-            raise ValueError(f"Unsupported file type: {self.path.suffix}")
-
-
     def contains_string(self, pattern):
-        """
-        if self.
-        if self.content is None:
-            self.content = self._load_file_content(self.path)
+        """Works for select files"""
+        if self.extension in ['.htm', '.html', '.txt','.xml']:
             return bool(re.search(pattern, self.content))
         return False
 
@@ -104,15 +96,14 @@ class Document:
     def parse(self):
         mapping_dict = None
 
-        if self.
+        if self.extension == '.xml':
             if self.type in ['3', '4', '5']:
                 mapping_dict = dict_345
 
-            self.load_content()
             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+
         # will deprecate this when we add html2dict
-        elif self.
-            self._load_file_content()
+        elif self.extension in ['.htm', '.html','.txt']:
 
             if self.type == '10-K':
                 mapping_dict = dict_10k
@@ -133,18 +124,12 @@ class Document:
         if not self.data:
             self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
-
         with open(output_filename, 'w',encoding='utf-8') as f:
             json.dump(self.data, f, indent=2)
 
     def write_csv(self, output_filename=None, accession_number=None):
         self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
-
         with open(output_filename, 'w', newline='') as csvfile:
             if not self.data:
                 return output_filename
@@ -165,7 +150,7 @@ class Document:
             writer.writeheader()
             for row in self.data:
                 if accession_number:
-                    row['Accession Number'] =
+                    row['Accession Number'] = accession_number
                 writer.writerow(row)
 
         return output_filename
@@ -225,7 +210,7 @@ class Document:
         # Let's remove XML iterable for now
 
         # Handle text-based documents
-        if self.
+        if self.extension in ['.txt', '.htm', '.html']:
            document_data = self.data
            if not document_data:
                return iter([])
@@ -235,13 +220,13 @@ class Document:
            section_type = None
 
            if self.type in ['10-K', '10-Q']:
-                mapping_dict =
+                mapping_dict = dict_10k if self.type == '10-K' else dict_10q
            elif self.type == '8-K':
-                mapping_dict =
+                mapping_dict = dict_8k
            elif self.type == 'SC 13D':
-                mapping_dict =
+                mapping_dict = dict_13d
            elif self.type == 'SC 13G':
-                mapping_dict =
+                mapping_dict = dict_13g
            else:
                return iter([])
 
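Net effect of the document.py changes: Document no longer lazily loads from self.path. It now takes content and extension in the constructor, lower-cases the extension, and preprocesses .txt and .htm/.html content immediately. A minimal usage sketch under the new signature (the HTML string is illustrative, and it assumes the HTML preprocessor returns the visible text, as the hidden-node stripping above suggests):

    from datamule.document import Document

    # Hypothetical in-memory filing fragment
    html = "<html><body><p>Item 1A. Risk Factors</p></body></html>"

    doc = Document(type='10-K', content=html, extension='.HTML')
    print(doc.extension)                # '.html' (lower-cased by the constructor)
    print(doc.contains_string('Risk'))  # regex search over the preprocessed content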
datamule/portfolio.py
CHANGED
@@ -142,7 +142,7 @@ class Portfolio:
             cik=cik,
             submission_type=submission_type,
             filing_date=filing_date,
-            requests_per_second=5,
+            requests_per_second=5,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
         )
 
@@ -179,4 +179,8 @@ class Portfolio:
             document_types = [document_types]
 
         for submission in self.submissions:
-            yield from submission.document_type(document_types)
+            yield from submission.document_type(document_types)
+
+    def keep(self,document_type):
+        for submission in self.__iter__():
+            submission.keep(document_type)
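The new Portfolio.keep() fans the filter out to Submission.keep() (shown further below), which deletes non-matching document files on disk. A hedged sketch of the call pattern, assuming Portfolio is exported at the package root and takes a directory of downloaded submissions as in earlier datamule versions:

    from datamule import Portfolio  # assumed package-root export

    portfolio = Portfolio('filings')  # hypothetical directory of downloaded submissions
    portfolio.keep('10-K')            # deletes every non-10-K document file per submission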
datamule/sec/submissions/downloader.py
CHANGED
@@ -36,7 +36,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         print(f"Error processing {accno}: {e}")
         return None
 
-def download(cik=None, submission_type=None, filing_date=None,
+def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None, quiet=False):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -44,12 +45,25 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_seco
     - cik: CIK number(s) to query for
     - submission_type: Filing type(s) to query for (default: 10-K)
     - filing_date: Date or date range to query for
+    - location: Location code to filter by (e.g., 'CA' for California)
+    - name: Company name to search for (alternative to providing CIK)
     - requests_per_second: Rate limit for SEC requests
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
+    - quiet: Whether to suppress progress output
 
     Returns:
     - List of all document paths processed
+
+    Examples:
+    # Download filings by CIK
+    download(cik="1318605", submission_type="10-K")
+
+    # Download filings by company name
+    download(name="Tesla", submission_type="10-K")
+
+    # Download filings with location filter
+    download(name="Apple", location="CA", submission_type="10-K")
     """
 
     # Make sure output directory exists
@@ -62,9 +76,12 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_seco
     # Call the stream function with our callback
     return stream(
         cik=cik,
+        name=name,
         submission_type=submission_type,
         filing_date=filing_date,
+        location=location,
         requests_per_second=requests_per_second,
         document_callback=callback_wrapper,
-        accession_numbers=accession_numbers
+        accession_numbers=accession_numbers,
+        quiet=quiet
     )
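download() now forwards name, location, and quiet straight through to stream(). A sketch combining the new filters with the existing accession-number whitelist (all values hypothetical):

    from datamule.sec.submissions.downloader import download

    paths = download(
        name="Tesla",                                # resolved to a CIK via the EFTS name search
        location="CA",                               # EDGAR location code
        submission_type="10-K",
        accession_numbers=["0001628280-24-002390"],  # hypothetical accession number
        quiet=True,                                  # suppress progress output
    )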
datamule/sec/submissions/eftsquery.py
CHANGED
@@ -42,6 +42,67 @@ class EFTSQuery:
         await self.session.close()
         self.session = None
 
+    async def search_name(self, name):
+        """
+        Search for companies by name using the EFTS name search endpoint.
+
+        Parameters:
+        name (str): Company name to search for
+
+        Returns:
+        list: List of dictionaries containing company information (entity, id, tickers if available)
+        """
+        if not self.session:
+            raise RuntimeError("No active session. This method must be called within an async context.")
+
+        url = f"{self.base_url}?keysTyped={name}"
+
+        if not self.quiet:
+            print(f"Searching for company: {name}")
+
+        async with self.limiter:
+            try:
+                async with self.session.get(url) as response:
+                    if response.status == 429:
+                        raise RetryException(url)
+                    response.raise_for_status()
+                    content = await response.read()
+                    await self.rate_monitor.add_request(len(content))
+                    data = await response.json()
+
+                    if 'hits' in data and 'hits' in data['hits']:
+                        hits = data['hits']['hits']
+                        results = []
+
+                        for hit in hits:
+                            source = hit.get('_source', {})
+                            result = {
+                                'entity': source.get('entity', ''),
+                                'id': hit.get('_id', ''),
+                                'tickers': source.get('tickers', '')
+                            }
+                            results.append(result)
+
+                        if not self.quiet and results:
+                            # Create a compact display of results
+                            display_results = [f"{r['entity']} [{r['id']}]" for r in results]
+                            print(f"Name matches: {', '.join(display_results[:5])}")
+                            if len(results) > 5:
+                                print(f"...and {len(results) - 5} more matches")
+
+                        return results
+                    return []
+            except aiohttp.ClientResponseError as e:
+                if e.status == 429:
+                    raise RetryException(url)
+                if not self.quiet:
+                    print(f"Error searching for company: {str(e)}")
+                return []
+            except Exception as e:
+                if not self.quiet:
+                    print(f"Error searching for company: {str(e)}")
+                return []
+
     def _get_form_exclusions(self, form):
         """Dynamically generate form exclusions based on patterns"""
         # Skip already negated forms
@@ -55,7 +116,7 @@ class EFTSQuery:
         # No exclusions for amendment forms
         return []
 
-    def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
+    def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
         params = {}
 
         # Handle CIK
@@ -111,6 +172,10 @@ class EFTSQuery:
             params['startdt'] = "2001-01-01"
             params['enddt'] = datetime.now().strftime('%Y-%m-%d')
 
+        # Handle location filtering
+        if location:
+            params['filter_location'] = location
+
         return params
 
     def _get_query_description(self, params):
@@ -125,6 +190,9 @@ class EFTSQuery:
         if 'startdt' in params and 'enddt' in params:
             parts.append(f"dates={params['startdt']} to {params['enddt']}")
 
+        if 'filter_location' in params:
+            parts.append(f"location={params['filter_location']}")
+
         return ", ".join(parts)
 
     async def _fetch_json(self, url):
@@ -413,12 +481,26 @@ class EFTSQuery:
         for params, from_val, size_val, callback in self.pending_page_requests:
             await self.fetch_queue.put((params, from_val, size_val, callback))
 
-    async def query(self, cik=None, submission_type=None, filing_date=None, callback=None):
-
-
+    async def query(self, cik=None, submission_type=None, filing_date=None, location=None, callback=None, name=None):
+        """
+        Query SEC filings using the EFTS API.
+
+        Parameters:
+        cik (str or list): Central Index Key(s) for the company
+        submission_type (str or list): Filing form type(s) to filter by
+        filing_date (str, tuple, or list): Date or date range to filter by
+        location (str): Location code to filter by (e.g., 'CA' for California)
+        callback (function): Async callback function to process results as they arrive
+        name (str): Company name to search for (alternative to providing CIK)
 
-
-
+        Returns:
+        list: List of filing documents matching the query criteria
+        """
+        # If both CIK and name are provided, raise an error
+        if cik is not None and name is not None:
+            raise ValueError("Please provide either 'name' or 'cik', not both")
+
+        all_hits = []
 
         # Collector callback to gather all hits
         async def collect_hits(hits):
@@ -427,6 +509,25 @@ class EFTSQuery:
                 await callback(hits)
 
         async with self as client:
+            # If name is provided, search for matching companies inside the context manager
+            if name is not None:
+                company_results = await self.search_name(name)
+                if not company_results:
+                    if not self.quiet:
+                        print(f"No companies found matching: {name}")
+                    return []
+
+                # Use the first (best) match's CIK
+                cik = company_results[0]['id']
+                if not self.quiet:
+                    print(f"Using CIK {cik} for {company_results[0]['entity']}")
+
+            # Now prepare parameters with the CIK (either provided directly or from name search)
+            params = self._prepare_params(cik, submission_type, filing_date, location)
+
+            # Check if this is a primary documents query
+            self.was_primary_docs_query = '-0' in params.get('forms', '').split(',')
+
             # Reset state for new query
             self.total_results_to_fetch = 0
             self.pending_page_requests = []
@@ -506,12 +607,32 @@ class EFTSQuery:
             print(f"\n--- Query complete: {len(all_hits):,} submissions retrieved ---")
         return all_hits
 
-def query_efts(cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, callback=None, quiet=False):
+def query_efts(cik=None, submission_type=None, filing_date=None, location=None, requests_per_second=5.0, callback=None, quiet=False, name=None):
     """
     Convenience function to run a query without managing the async context.
+
+    Parameters:
+    cik (str or list): Central Index Key(s) for the company
+    submission_type (str or list): Filing form type(s) to filter by
+    filing_date (str, tuple, or list): Date or date range to filter by
+    location (str): Location code to filter by (e.g., 'CA' for California)
+    requests_per_second (float): Maximum requests per second to make to the SEC API
+    callback (function): Async callback function to process results as they arrive
+    quiet (bool): Whether to suppress progress output
+    name (str): Company name to search for (alternative to providing CIK)
+
+    Returns:
+    list: List of filing documents matching the query criteria
+
+    Example:
+    To search by company name:
+    results = query_efts(name="Tesla", submission_type="10-K")
+
+    To search by CIK:
+    results = query_efts(cik="1318605", submission_type="10-K")
     """
     async def run_query():
         query = EFTSQuery(requests_per_second=requests_per_second, quiet=quiet)
-        return await query.query(cik, submission_type, filing_date, callback)
+        return await query.query(cik, submission_type, filing_date, location, callback, name)
 
     return asyncio.run(run_query())
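search_name() hits the EFTS base URL with a keysTyped parameter and returns entity/id/tickers dicts; query() then takes the first match's id as the CIK. A sketch of calling it directly (relies on the async context manager that query()'s own "async with self as client" implies):

    import asyncio
    from datamule.sec.submissions.eftsquery import EFTSQuery

    async def main():
        q = EFTSQuery(requests_per_second=5.0, quiet=True)
        async with q:  # search_name raises RuntimeError without an active session
            matches = await q.search_name("Tesla")
        # Each match looks like {'entity': ..., 'id': ..., 'tickers': ...};
        # query() would use matches[0]['id'] as the CIK
        print(matches[:3])

    asyncio.run(main())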
datamule/sec/submissions/monitor.py
CHANGED
@@ -5,7 +5,7 @@ from ..rss.monitor import start_monitor # Import start_monitor directly
 import pytz
 
 
-async def _process_efts_hits(hits, collected_accession_numbers, data_callback=None):
+async def _process_efts_hits(hits, collected_accession_numbers, data_callback=None,rate_limiter=None):
     """Process EFTS hits, collect accession numbers, and call data callback."""
     processed_hits = []
 
@@ -36,7 +36,7 @@ async def _process_efts_hits(hits, collected_accession_numbers, data_callback=No
 
     # Call data callback if provided
     if data_callback and processed_hits:
-        await data_callback(processed_hits)
+        await data_callback(processed_hits, rate_limiter)
 
     return processed_hits
 
@@ -61,7 +61,7 @@ async def _master_monitor_impl(data_callback=None, poll_callback=None, submissio
 
     # Prepare a wrapper callback to collect accession numbers
     async def process_callback(hits):
-
+        await _process_efts_hits(hits, collected_accession_numbers, data_callback, efts_query.limiter)
 
     # Create an EFTSQuery instance
     efts_query = EFTSQuery(requests_per_second=requests_per_second)
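Since _process_efts_hits now threads the query's limiter through, any data_callback passed to the monitor must accept two arguments. A conforming callback sketch (the body is hypothetical; it assumes the limiter is usable as an async context manager, as its "async with self.limiter" usage in eftsquery.py shows):

    async def my_data_callback(processed_hits, rate_limiter):
        for hit in processed_hits:
            async with rate_limiter:  # share the monitor's SEC rate limit for follow-ups
                ...                   # hypothetical per-hit processing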
datamule/sec/submissions/streamer.py
CHANGED
@@ -21,8 +21,8 @@ def fix_filing_url(url):
     return url
 
 class Streamer(EFTSQuery):
-    def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None):
-        super().__init__(requests_per_second=requests_per_second)
+    def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None, quiet=False):
+        super().__init__(requests_per_second=requests_per_second, quiet=quiet)
         self.document_callback = document_callback
         self.document_queue = asyncio.Queue()
         self.download_in_progress = asyncio.Event()
@@ -57,12 +57,14 @@ class Streamer(EFTSQuery):
                         await callback(hits)
                     self.fetch_queue.task_done()
                 except Exception as e:
-
+                    if not self.quiet:
+                        print(f"\nError fetching {url}: {str(e)}")
                     self.fetch_queue.task_done()
             except asyncio.CancelledError:
                 break
             except Exception as e:
-
+                if not self.quiet:
+                    print(f"\nWorker error: {str(e)}")
                 self.fetch_queue.task_done()
 
     def _construct_submission_url(self, hit):
@@ -85,7 +87,8 @@ class Streamer(EFTSQuery):
 
             return url, cik, accno_w_dash
         except (KeyError, IndexError) as e:
-
+            if not self.quiet:
+                print(f"Error constructing URL for hit: {hit}. Error: {str(e)}")
             return None, None, None
 
     async def _document_download_worker(self):
@@ -115,13 +118,15 @@ class Streamer(EFTSQuery):
 
                     self.document_queue.task_done()
                 except Exception as e:
-
+                    if not self.quiet:
+                        print(f"\nError streaming document {doc_url}: {str(e)}")
                     self.document_queue.task_done()
 
             except asyncio.CancelledError:
                 break
             except Exception as e:
-
+                if not self.quiet:
+                    print(f"\nDocument worker error: {str(e)}")
                 self.document_queue.task_done()
 
     async def document_download_callback(self, hits):
@@ -133,7 +138,7 @@ class Streamer(EFTSQuery):
         self.download_in_progress.set()
 
         # Create progress bar for documents if not exists
-        if not self.document_pbar:
+        if not self.document_pbar and not self.quiet:
             self.document_pbar = tqdm(total=0, desc="Streaming submissions")
 
         # Queue up the documents for download
@@ -141,7 +146,8 @@ class Streamer(EFTSQuery):
             doc_url, cik, accno = self._construct_submission_url(hit)
             if doc_url:
                 # Update document progress bar total
-                self.document_pbar
+                if self.document_pbar:
+                    self.document_pbar.total += 1
                 self.total_documents += 1
 
                 # Add to download queue
@@ -159,8 +165,20 @@ class Streamer(EFTSQuery):
         # Signal that document download is complete
         self.download_in_progress.clear()
 
-    async def stream(self, cik=None, submission_type=None, filing_date=None):
-        """
+    async def stream(self, cik=None, submission_type=None, filing_date=None, location=None, name=None):
+        """
+        Main method to stream EFTS results and download documents
+
+        Parameters:
+        cik (str or list): Central Index Key(s) for the company
+        submission_type (str or list): Filing form type(s) to filter by
+        filing_date (str, tuple, or list): Date or date range to filter by
+        location (str): Location code to filter by (e.g., 'CA' for California)
+        name (str): Company name to search for (alternative to providing CIK)
+
+        Returns:
+        list: List of all EFTS hits processed
+        """
         # Create document worker tasks
         self.document_workers = [
             asyncio.create_task(self._document_download_worker())
@@ -173,11 +191,12 @@ class Streamer(EFTSQuery):
         self.skipped_documents = 0
 
         # Run the main query with our document download callback
-        results = await self.query(cik, submission_type, filing_date, self.document_download_callback)
+        results = await self.query(cik, submission_type, filing_date, location, self.document_download_callback, name)
 
         # Make sure all document downloads are complete
         if self.download_in_progress.is_set():
-
+            if not self.quiet:
+                print("Waiting for remaining document downloads to complete...")
             await self.document_queue.join()
 
         # Clean up document workers
@@ -190,14 +209,17 @@ class Streamer(EFTSQuery):
         if self.document_pbar:
             self.document_pbar.close()
             self.document_pbar = None  # Set to None to prevent reuse
-
-
-
-
+
+        if not self.quiet:
+            print(f"\n--- Streaming complete: {len(results)} EFTS results processed ---")
+            if self.accession_numbers is not None:
+                print(f"--- {self.documents_processed} documents downloaded, {self.skipped_documents} skipped due to accession number filter ---")
+
         return results
 
-def stream(cik=None, submission_type=None, filing_date=None,
-
+def stream(cik=None, submission_type=None, filing_date=None, location=None,
+           requests_per_second=5.0, document_callback=None, accession_numbers=None,
+           quiet=False, name=None):
     """
     Stream EFTS results and download documents into memory.
 
@@ -205,15 +227,28 @@ def stream(cik=None, submission_type=None, filing_date=None,
     - cik: CIK number(s) to query for
     - submission_type: Filing type(s) to query for
     - filing_date: Date or date range to query for
+    - location: Location code to filter by (e.g., 'CA' for California)
     - requests_per_second: Rate limit for SEC requests (combined EFTS and document downloads)
     - document_callback: Callback function that receives (hit, content, cik, accno, url)
    - accession_numbers: Optional list of accession numbers to filter by
+    - quiet: Whether to suppress progress output
+    - name: Company name to search for (alternative to providing CIK)
 
     Returns:
     - List of all EFTS hits processed
+
+    Example:
+    To search by company name:
+    results = stream(name="Tesla", submission_type="10-K")
+
+    To search by CIK:
+    results = stream(cik="1318605", submission_type="10-K")
+
+    To search with location filter:
+    results = stream(name="Tesla", location="CA", submission_type="10-K")
     """
-
-    #
+
+    # Check if acc no is empty list
     if accession_numbers == []:
         raise ValueError("Applied filter resulted in empty accession numbers list")
 
@@ -221,8 +256,9 @@ def stream(cik=None, submission_type=None, filing_date=None,
         streamer = Streamer(
             requests_per_second=requests_per_second,
             document_callback=document_callback,
-            accession_numbers=accession_numbers
+            accession_numbers=accession_numbers,
+            quiet=quiet
         )
-        return await streamer.stream(cik, submission_type, filing_date)
+        return await streamer.stream(cik, submission_type, filing_date, location, name)
 
     return asyncio.run(run_stream())
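Per the docstring, document_callback receives (hit, content, cik, accno, url) for each submission streamed into memory. A sketch wiring one up together with the new keyword arguments (values hypothetical):

    from datamule.sec.submissions.streamer import stream

    async def on_document(hit, content, cik, accno, url):
        # content is the raw submission held in memory, never written to disk
        print(f"{accno}: {len(content)} bytes from {url}")

    hits = stream(
        name="Tesla",           # resolved to a CIK via EFTSQuery.search_name
        location="CA",
        submission_type="8-K",
        document_callback=on_document,
        quiet=True,
    )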
datamule/sec/submissions/textsearch.py
CHANGED
@@ -13,9 +13,9 @@ class TextSearchEFTSQuery(EFTSQuery):
         super().__init__(requests_per_second=requests_per_second, quiet=quiet)
         self.text_query = text_query
 
-    def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
+    def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
         # Get base parameters from parent class
-        params = super()._prepare_params(cik, submission_type, filing_date)
+        params = super()._prepare_params(cik, submission_type, filing_date, location)
 
         # Add text query parameter
         params['q'] = self.text_query
@@ -46,7 +46,8 @@ async def extract_accession_numbers(hits):
             accession_numbers.append(acc_no)
     return accession_numbers
 
-def query(text_query, cik=None, submission_type=None, filing_date=None,
+def query(text_query, cik=None, submission_type=None, filing_date=None, location=None,
+          name=None, requests_per_second=5.0, quiet=False):
     """
     Search SEC filings for text and return the full search results.
 
@@ -63,6 +64,10 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
     filing_date : str, tuple, list, optional
         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
         a tuple of (start_date, end_date), or a list of dates.
+    location : str, optional
+        Location code to filter by (e.g., 'CA' for California).
+    name : str, optional
+        Company name to search for (alternative to providing CIK).
     requests_per_second : float, optional
         Maximum number of requests per second to make to the SEC API.
         Default is 5.0.
@@ -73,14 +78,23 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
     --------
     list
         Complete search results with all hit data.
+
+    Examples:
+    ---------
+    # Search for 'climate risk' in Tesla's 10-K filings using company name
+    results = query('"climate risk"', name='Tesla', submission_type='10-K')
+
+    # Search for 'pandemic' in California companies' filings
+    results = query('pandemic', location='CA', submission_type='8-K')
     """
     async def run_query():
         query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
-        return await query.query(cik, submission_type, filing_date)
+        return await query.query(cik, submission_type, filing_date, location, None, name)
 
     return asyncio.run(run_query())
 
-def filter_text(text_query, cik=None, submission_type=None, filing_date=None,
+def filter_text(text_query, cik=None, submission_type=None, filing_date=None, location=None,
+                name=None, requests_per_second=5.0, quiet=False):
     """
     Search SEC filings for text and return matching accession numbers.
 
@@ -97,6 +111,10 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
     filing_date : str, tuple, list, optional
         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
         a tuple of (start_date, end_date), or a list of dates.
+    location : str, optional
+        Location code to filter by (e.g., 'CA' for California).
+    name : str, optional
+        Company name to search for (alternative to providing CIK).
     requests_per_second : float, optional
         Maximum number of requests per second to make to the SEC API.
         Default is 5.0.
@@ -107,6 +125,15 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
     --------
     list
         List of accession numbers (as strings) for filings that match the text query.
+
+    Examples:
+    ---------
+    # Get accession numbers of Apple filings mentioning 'supply chain'
+    acc_numbers = filter_text('"supply chain"', name='Apple')
+
+    # Use the accession numbers as a filter in another API
+    from .downloader import download
+    download(name='Apple', accession_numbers=acc_numbers)
     """
     async def run_query():
         query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
@@ -119,7 +146,7 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
             all_acc_nos.extend(acc_nos)
 
         # Run the query with our callback
-        await query_obj.query(cik, submission_type, filing_date, collect_acc_nos)
+        await query_obj.query(cik, submission_type, filing_date, location, collect_acc_nos, name)
 
         return all_acc_nos
 
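One detail worth noting in the docstring examples: EDGAR full-text search treats a double-quoted string as an exact phrase, so the Python literal carries both quote levels:

    from datamule.sec.submissions.textsearch import query

    exact = query('"climate risk"', name='Tesla', submission_type='10-K')  # exact phrase
    loose = query('climate risk', name='Tesla', submission_type='10-K')    # terms matched individually, not as a phrase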
datamule/sheet.py
CHANGED
@@ -30,5 +30,12 @@ class Sheet:
         pass
     def query_xbrl():
         pass
-
+
+    # LIST TUPLE SYNTAX, so e.g. value (0,100) is 0-100, while [0,100] is 0 and 100
+    def get_13fhr(reportingOwnerCIK,nameOfIssuer,titleOfClass,cusip,value,
+                  shrsOrPrnAmt_sshPrnamt,shrsOrPrnAmt_sshPrnamtType,investmentDiscretion,otherManager,
+                  votingAuthority_Sole,
+                  votingAuthority_Shared,
+                  votingAuthority_None,
+                  filing_date):
         pass
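get_13fhr is still a stub, but its comment pins down the planned filter convention: a tuple like (0,100) is an inclusive range, while a list like [0,100] means the exact values 0 and 100. A self-contained sketch of that convention (my illustration, not code from the package):

    def matches(value, criterion):
        if isinstance(criterion, tuple):   # (low, high) -> inclusive range
            low, high = criterion
            return low <= value <= high
        if isinstance(criterion, list):    # [a, b, ...] -> exact membership
            return value in criterion
        return value == criterion

    print(matches(50, (0, 100)))  # True: 50 lies within 0-100
    print(matches(50, [0, 100]))  # False: 50 is neither exactly 0 nor exactly 100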
datamule/submission.py
CHANGED
@@ -1,16 +1,37 @@
 from pathlib import Path
 import json
 from .document import Document
+from secsgml import parse_sgml_submission_into_memory
+from pathlib import Path
 
 class Submission:
-    def __init__(self, path):
-
-
+    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
+        if path is None and sgml_content is None:
+            raise ValueError("Either path or sgml_content must be provided")
+        if path is not None and sgml_content is not None:
+            raise ValueError("Only one of path or sgml_content must be provided")
+
+        if sgml_content is not None:
+            self.path = None
+            self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+
+            for idx,doc in enumerate(self.metadata['documents']):
+                type = doc.get('type')
+
+                # Keep only specified types
+                if keep_document_types is not None and type not in keep_document_types:
+                    continue
+                filename = doc.get('filename')
+                extension = Path(filename).suffix
+                self.documents = [Document(type=type, content=raw_documents[idx], extension=extension)]
+
+
+        if path is not None:
+            self.path = Path(path)
+            metadata_path = self.path / 'metadata.json'
+            with metadata_path.open('r') as f:
+                self.metadata = json.load(f)
 
-    def _load_metadata(self):
-        metadata_path = self.path / 'metadata.json'
-        with metadata_path.open('r') as f:
-            self.metadata = json.load(f)
 
     def document_type(self, document_type):
         # Convert single document type to list for consistent handling
@@ -19,20 +40,73 @@ class Submission:
         else:
             document_types = document_type
 
-        for doc in self.metadata['documents']:
+        for idx,doc in enumerate(self.metadata['documents']):
             if doc['type'] in document_types:
+
+                # if loaded from path
+                if self.path is not None:
+                    filename = doc.get('filename')
+                    # oh we need handling here for sequences case
+                    if filename is None:
+                        filename = doc['sequence'] + '.txt'
+
+                    document_path = self.path / filename
+                    extension = document_path.suffix
+
+                    with document_path.open('r') as f:
+                        content = f.read()
+
+                    yield Document(type=doc['type'], content=content, extension=extension)
+                # if loaded from sgml_content
+                else:
+                    yield self.documents[idx]
+
+
+    def __iter__(self):
+        for idx,doc in enumerate(self.metadata['documents']):
+            # if loaded from path
+            if self.path is not None:
                 filename = doc.get('filename')
+
+                # oh we need handling here for sequences case
                 if filename is None:
-
+                    filename = doc['sequence'] + '.txt'
 
                 document_path = self.path / filename
-
-
-
-
-
-
-
-
-
-
+                extension = document_path.suffix
+
+                # check if the file exists
+                if document_path.exists():
+                    with document_path.open('r') as f:
+                        content = f.read()
+
+                    yield Document(type=doc['type'], content=content, extension=extension)
+                else:
+                    print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
+
+            # if loaded from sgml_content
+            else:
+                yield self.documents[idx]
+
+    # keep documents by document type
+    def keep(self, document_type):
+        # Convert single document type to list for consistent handling
+        if isinstance(document_type, str):
+            document_types = [document_type]
+        else:
+            document_types = document_type
+
+        if self.path is not None:
+            for doc in self.metadata['documents']:
+                filename = doc.get('filename')
+                type = doc.get('type')
+                if type not in document_types:
+                    # oh we need handling here for sequences case
+                    if filename is None:
+                        filename = doc.sequence + '.txt'
+
+                    document_path = self.path / filename
+                    # delete the file
+                    document_path.unlink()
+        else:
+            print("Warning: keep() method is only available when loading from path.")
{datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 datamule/__init__.py,sha256=l6YlwT5EeRxPlCtO5Jd4I8l266rSRUJyfFe97cRtSCM,991
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
-datamule/document.py,sha256=
+datamule/document.py,sha256=7FBmjWJJfdKrbQ4UH4J8It7W5GEWTFFEUfQdODUrYlQ,10160
 datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
 datamule/index.py,sha256=0txvbzPcvY1GsdxA-wGdLzAByxSeE_1VyyBp9mZEQRM,2292
-datamule/portfolio.py,sha256=
-datamule/sheet.py,sha256=
-datamule/submission.py,sha256=
+datamule/portfolio.py,sha256=ECevaiF8P6v4mJ7W9IM4hRKNF0GGdQzc1SzBWLnG2qQ,7082
+datamule/sheet.py,sha256=FF0JL8BuAZ7Sd_LY_-sCGJuYlhm3sKgj2jlHUGMjeUQ,1406
+datamule/submission.py,sha256=zWCnucjmfTYcr1Hm9Us-TjGLjWAHuRPtIyaVpLNvs4c,4427
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
@@ -16,11 +16,11 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/sec/submissions/downloader.py,sha256=
-datamule/sec/submissions/eftsquery.py,sha256=
-datamule/sec/submissions/monitor.py,sha256=
-datamule/sec/submissions/streamer.py,sha256=
-datamule/sec/submissions/textsearch.py,sha256
+datamule/sec/submissions/downloader.py,sha256=IB08W8-lQD5Bb0LgzrTN4Xi4HsCw24DybRLHqE1AUrU,3290
+datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
+datamule/sec/submissions/monitor.py,sha256=F24I9yn1k8ggbCJQ-Vk7go_qJHlpkBzVKFYKDs_CWLs,5287
+datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
+datamule/sec/submissions/textsearch.py,sha256=-a5yIrrxxtaK10IJeywFmXuJmSndYL9VKm4SC4I9JAs,5808
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
 datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
@@ -29,7 +29,7 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/downloader.py,sha256=Zb1TxsIz887tO3MJVP66siYVtNus89ti-g9oZ6VywrM,11500
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.1.
-datamule-1.1.
-datamule-1.1.
-datamule-1.1.
+datamule-1.1.7.dist-info/METADATA,sha256=gIryya087eiyvgFA5S5vf2s_wKDxaV3ZEAJA7-W4kS8,512
+datamule-1.1.7.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+datamule-1.1.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.1.7.dist-info/RECORD,,
{datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/WHEEL
File without changes
{datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/top_level.txt
File without changes