datamule 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,228 @@
+ import asyncio
+ from urllib.parse import urlencode
+ from tqdm import tqdm
+ import re
+
+ from .eftsquery import EFTSQuery
+
+
+ # This is to fix some broken SEC URLs. There's a better way to do this, but this is a quick fix.
+ def fix_filing_url(url):
+     match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
+     if match_suffix:
+         suffix_number = match_suffix.group(1)
+         file_ext = match_suffix.group(2)
+         match_accession = re.search(r'/(\d{18})/', url)
+         if match_accession:
+             accession_number = match_accession.group(1)
+             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
+             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
+             return new_url
+     return url
+
+ class Streamer(EFTSQuery):
+     def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None):
+         super().__init__(requests_per_second=requests_per_second)
+         self.document_callback = document_callback
+         self.document_queue = asyncio.Queue()
+         self.download_in_progress = asyncio.Event()
+         self.query_paused = asyncio.Event()
+         self.document_pbar = None
+         self.document_workers = []
+         self.documents_processed = 0
+         self.total_documents = 0
+         self.accession_numbers = accession_numbers
+         self.skipped_documents = 0
+
+     async def _fetch_worker(self):
+         """Override the parent class worker to implement pause/resume"""
+         while True:
+             try:
+                 # Check if we should pause for document downloads.
+                 # Event.wait() returns immediately while the event is set, so
+                 # poll until document_download_callback clears query_paused.
+                 while self.query_paused.is_set():
+                     await asyncio.sleep(0.1)
+
+                 params, from_val, size_val, callback = await self.fetch_queue.get()
+
+                 url = f"{self.base_url}?{urlencode(params, doseq=True)}&from={from_val}&size={size_val}"
+
+                 try:
+                     data = await self._fetch_json(url)
+                     if 'hits' in data:
+                         hits = data['hits']['hits']
+                         if self.pbar:
+                             self.pbar.update(len(hits))
+                         if callback:
+                             await callback(hits)
+                     self.fetch_queue.task_done()
+                 except Exception as e:
+                     print(f"\nError fetching {url}: {str(e)}")
+                     self.fetch_queue.task_done()
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 print(f"\nWorker error: {str(e)}")
+                 self.fetch_queue.task_done()
+
+     def _construct_submission_url(self, hit):
+         """Construct the URL for retrieving the actual submission"""
+         try:
+             # Extract CIK from the hit
+             cik = hit['_source']['ciks'][0]
+
+             # Extract accession number from _id (format: accno:file.txt)
+             accno_w_dash = hit['_id'].split(':')[0]
+             accno_no_dash = accno_w_dash.replace('-', '')
+
+             # Check if we should filter by accession numbers
+             if self.accession_numbers is not None and accno_w_dash not in self.accession_numbers:
+                 return None, None, None
+
+             # Construct the URL
+             url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accno_no_dash}/{accno_w_dash}.txt"
+             url = fix_filing_url(url)
+
+             return url, cik, accno_w_dash
+         except (KeyError, IndexError) as e:
+             print(f"Error constructing URL for hit: {hit}. Error: {str(e)}")
+             return None, None, None
+
+     async def _document_download_worker(self):
+         """Worker to download actual filing documents"""
+         while True:
+             try:
+                 hit, doc_url, cik, accno = await self.document_queue.get()
+
+                 try:
+                     # Use the same rate limiter as the EFTS queries
+                     async with self.limiter:
+                         async with self.session.get(doc_url) as response:
+                             response.raise_for_status()
+                             content = await response.read()
+
+                             # Update rate monitor
+                             await self.rate_monitor.add_request(len(content))
+
+                             # Call document callback with content in memory
+                             if self.document_callback:
+                                 await self.document_callback(hit, content, cik, accno, doc_url)
+
+                             # Update progress bar
+                             if self.document_pbar:
+                                 self.document_pbar.update(1)
+                             self.documents_processed += 1
+
+                     self.document_queue.task_done()
+                 except Exception as e:
+                     print(f"\nError streaming document {doc_url}: {str(e)}")
+                     self.document_queue.task_done()
+
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 print(f"\nDocument worker error: {str(e)}")
+                 self.document_queue.task_done()
+
+     async def document_download_callback(self, hits):
+         """Callback to process EFTS query results and stream submissions"""
+         # Pause the EFTS query processing
+         self.query_paused.set()
+
+         # Signal that document download is in progress
+         self.download_in_progress.set()
+
+         # Create progress bar for documents if not exists
+         if not self.document_pbar:
+             self.document_pbar = tqdm(total=0, desc="Streaming submissions")
+
+         # Queue up the documents for download
+         for hit in hits:
+             doc_url, cik, accno = self._construct_submission_url(hit)
+             if doc_url:
+                 # Update document progress bar total
+                 self.document_pbar.total += 1
+                 self.total_documents += 1
+
+                 # Add to download queue
+                 await self.document_queue.put((hit, doc_url, cik, accno))
+             elif accno is None and self.accession_numbers is not None:
+                 # Document was skipped due to accession number filter
+                 self.skipped_documents += 1
+
+         # Wait for all documents to be downloaded
+         await self.document_queue.join()
+
+         # Resume EFTS query processing
+         self.query_paused.clear()
+
+         # Signal that document download is complete
+         self.download_in_progress.clear()
+
+     async def stream(self, cik=None, submission_type=None, filing_date=None):
+         """Main method to stream EFTS results and download documents"""
+         # Create document worker tasks
+         self.document_workers = [
+             asyncio.create_task(self._document_download_worker())
+             for _ in range(5)  # Same number as query workers
+         ]
+
+         # Reset counters
+         self.documents_processed = 0
+         self.total_documents = 0
+         self.skipped_documents = 0
+
+         # Run the main query with our document download callback
+         results = await self.query(cik, submission_type, filing_date, self.document_download_callback)
+
+         # Make sure all document downloads are complete
+         if self.download_in_progress.is_set():
+             print("Waiting for remaining document downloads to complete...")
+             await self.document_queue.join()
+
+         # Clean up document workers
+         for worker in self.document_workers:
+             worker.cancel()
+
+         await asyncio.gather(*self.document_workers, return_exceptions=True)
+
+         # Close document progress bar and don't show a new one
+         if self.document_pbar:
+             self.document_pbar.close()
+             self.document_pbar = None  # Set to None to prevent reuse
+
+         print(f"\n--- Streaming complete: {len(results)} EFTS results processed ---")
+         if self.accession_numbers is not None:
+             print(f"--- {self.documents_processed} documents downloaded, {self.skipped_documents} skipped due to accession number filter ---")
+         return results
+
+ def stream(cik=None, submission_type=None, filing_date=None,
+            requests_per_second=5.0, document_callback=None, accession_numbers=None):
+     """
+     Stream EFTS results and download documents into memory.
+
+     Parameters:
+     - cik: CIK number(s) to query for
+     - submission_type: Filing type(s) to query for
+     - filing_date: Date or date range to query for
+     - requests_per_second: Rate limit for SEC requests (combined EFTS and document downloads)
+     - document_callback: Callback function that receives (hit, content, cik, accno, url)
+     - accession_numbers: Optional list of accession numbers to filter by
+
+     Returns:
+     - List of all EFTS hits processed
+     """
+
+     # Reject an explicitly empty accession number filter
+     if accession_numbers == []:
+         raise ValueError("Applied filter resulted in empty accession numbers list")
+
+     async def run_stream():
+         streamer = Streamer(
+             requests_per_second=requests_per_second,
+             document_callback=document_callback,
+             accession_numbers=accession_numbers
+         )
+         return await streamer.stream(cik, submission_type, filing_date)
+
+     return asyncio.run(run_stream())
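For context, a minimal usage sketch of the module-level stream() helper above (not part of the package diff). The callback signature (hit, content, cik, accno, url) follows the docstring; the import of stream is omitted because this hunk does not show the file's path inside the package, and the CIK, form type, and dates are arbitrary examples.

    async def save_submission(hit, content, cik, accno, url):
        # content is the raw submission held in memory; persist it to disk
        with open(f"{accno}.txt", "wb") as f:
            f.write(content)

    results = stream(
        cik="320193",                      # arbitrary example CIK
        submission_type="10-K",
        filing_date=("2020-01-01", "2023-12-31"),
        requests_per_second=5.0,
        document_callback=save_submission,
    )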
@@ -0,0 +1,122 @@
+ import asyncio
+ import aiohttp
+ from datetime import datetime
+ from urllib.parse import urlencode
+ from tqdm import tqdm
+ from .eftsquery import EFTSQuery
+
+ class TextSearchEFTSQuery(EFTSQuery):
+     """
+     Extended EFTSQuery class that adds text search capabilities.
+     """
+     def __init__(self, text_query, requests_per_second=5.0):
+         super().__init__(requests_per_second=requests_per_second)
+         self.text_query = text_query
+
+     def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
+         # Get base parameters from parent class
+         params = super()._prepare_params(cik, submission_type, filing_date)
+
+         # Add text query parameter
+         params['q'] = self.text_query
+
+         return params
+
+ async def extract_accession_numbers(hits):
+     """
+     Extract accession numbers from hits.
+
+     Parameters:
+     -----------
+     hits : list
+         List of hit objects from the EFTS API.
+
+     Returns:
+     --------
+     list
+         List of accession numbers extracted from the hits.
+     """
+     accession_numbers = []
+     for hit in hits:
+         if '_id' in hit:
+             # Extract accession number (part before the colon)
+             doc_id = hit['_id']
+             if ':' in doc_id:
+                 acc_no = doc_id.split(':')[0]
+                 accession_numbers.append(acc_no)
+     return accession_numbers
+
+ def query(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0):
+     """
+     Search SEC filings for text and return the full search results.
+
+     Parameters:
+     -----------
+     text_query : str
+         The text to search for in filings. To search for an exact phrase, use double quotes.
+         Example: 'covid' or '"climate change"'
+     cik : str, list, optional
+         CIK number(s) to filter by. Will be zero-padded to 10 digits.
+     submission_type : str, list, optional
+         Filing type(s) to filter by (e.g., '10-K', '10-Q').
+         Defaults to '-0' for primary documents only.
+     filing_date : str, tuple, list, optional
+         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
+         a tuple of (start_date, end_date), or a list of dates.
+     requests_per_second : float, optional
+         Maximum number of requests per second to make to the SEC API.
+         Default is 5.0.
+
+     Returns:
+     --------
+     list
+         Complete search results with all hit data.
+     """
+     async def run_query():
+         query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second)
+         return await query.query(cik, submission_type, filing_date)
+
+     return asyncio.run(run_query())
+
+ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0):
+     """
+     Search SEC filings for text and return matching accession numbers.
+
+     Parameters:
+     -----------
+     text_query : str
+         The text to search for in filings. To search for an exact phrase, use double quotes.
+         Example: 'covid' or '"climate change"'
+     cik : str, list, optional
+         CIK number(s) to filter by. Will be zero-padded to 10 digits.
+     submission_type : str, list, optional
+         Filing type(s) to filter by (e.g., '10-K', '10-Q').
+         Defaults to '-0' for primary documents only.
+     filing_date : str, tuple, list, optional
+         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
+         a tuple of (start_date, end_date), or a list of dates.
+     requests_per_second : float, optional
+         Maximum number of requests per second to make to the SEC API.
+         Default is 5.0.
+
+     Returns:
+     --------
+     list
+         List of accession numbers (as strings) for filings that match the text query.
+     """
+     async def run_query():
+         query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second)
+
+         # Create a collector for accession numbers
+         all_acc_nos = []
+
+         async def collect_acc_nos(hits):
+             acc_nos = await extract_accession_numbers(hits)
+             all_acc_nos.extend(acc_nos)
+
+         # Run the query with our callback
+         await query_obj.query(cik, submission_type, filing_date, collect_acc_nos)
+
+         return all_acc_nos
+
+     return asyncio.run(run_query())
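A usage sketch for the two public helpers above (not part of the package diff; imports are omitted because this hunk does not show the file's path). Note that filter_text() returns dashed accession numbers, the same format the Streamer hunk earlier in this diff accepts through its accession_numbers filter.

    # Full hit objects for an exact-phrase search
    hits = query('"climate change"', submission_type="10-K",
                 filing_date=("2021-01-01", "2021-12-31"))

    # The same search reduced to accession numbers, e.g. '0001234567-21-000001'
    acc_nos = filter_text('"climate change"', submission_type="10-K",
                          filing_date=("2021-01-01", "2021-12-31"))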
datamule/sec/utils.py ADDED
@@ -0,0 +1,64 @@
+ import asyncio
+ import time
+ from collections import deque
+
+
+ class RetryException(Exception):
+     def __init__(self, url, retry_after=601):  # SEC rate limit is typically 10 minutes.
+         self.url = url
+         self.retry_after = retry_after
+
+ class PreciseRateLimiter:
+     def __init__(self, rate, interval=1.0):
+         self.rate = rate  # requests per interval
+         self.interval = interval  # in seconds
+         self.token_time = self.interval / self.rate  # time per token
+         self.last_time = time.time()
+         self.lock = asyncio.Lock()
+
+     async def acquire(self):
+         async with self.lock:
+             now = time.time()
+             wait_time = self.last_time + self.token_time - now
+             if wait_time > 0:
+                 await asyncio.sleep(wait_time)
+             self.last_time = time.time()
+             return True
+
+     async def __aenter__(self):
+         await self.acquire()
+         return self
+
+     async def __aexit__(self, exc_type, exc, tb):
+         pass
+
+ class RateMonitor:
+     def __init__(self, window_size=1.0):
+         self.window_size = window_size
+         self.requests = deque()
+         self._lock = asyncio.Lock()
+
+     async def add_request(self, size_bytes):
+         async with self._lock:
+             now = time.time()
+             self.requests.append((now, size_bytes))
+             while self.requests and self.requests[0][0] < now - self.window_size:
+                 self.requests.popleft()
+
+     def get_current_rates(self):
+         now = time.time()
+         while self.requests and self.requests[0][0] < now - self.window_size:
+             self.requests.popleft()
+
+         if not self.requests:
+             return 0, 0
+
+         request_count = len(self.requests)
+         byte_count = sum(size for _, size in self.requests)
+
+         requests_per_second = request_count / self.window_size
+         mb_per_second = (byte_count / 1024 / 1024) / self.window_size
+
+         return round(requests_per_second, 1), round(mb_per_second, 2)
+
+ headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
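A sketch of how these helpers compose (not part of the package diff). It assumes aiohttp as the HTTP client, matching its use elsewhere in this release, and that the module is importable as datamule.sec.utils per the file path above; the URL is only an example.

    import asyncio
    import aiohttp
    from datamule.sec.utils import PreciseRateLimiter, RateMonitor, headers

    async def fetch(url):
        limiter = PreciseRateLimiter(rate=5)   # ~5 requests per second
        monitor = RateMonitor(window_size=10.0)
        async with aiohttp.ClientSession() as session:
            async with limiter:                # spaces requests out by 1/rate seconds
                async with session.get(url, headers=headers) as response:
                    body = await response.read()
        await monitor.add_request(len(body))
        print(monitor.get_current_rates())     # -> (requests per second, MB per second)
        return body

    asyncio.run(fetch("https://www.sec.gov/files/company_tickers.json"))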
File without changes
@@ -0,0 +1,83 @@
+ import os
+ import csv
+ from pathlib import Path
+ from .streamcompanyfacts import stream_company_facts
+
+ def process_company_data(data, output_path):
+     # Check for errors in data
+     if data and 'error' in data:
+         print(f"Error processing CIK {data.get('cik')}: {data.get('error')}")
+         return False
+
+     # Define CSV output path
+     company_cik = data.get('cik')
+     csv_path = output_path / f"{company_cik}.csv"
+
+     with open(csv_path, 'w', newline='') as csvfile:
+         fieldnames = [
+             'cik', 'entity_name', 'namespace', 'concept_name',
+             'end_date', 'value', 'unit', 'accession_number',
+             'fiscal_year', 'fiscal_period', 'form_type',
+             'filed_date', 'frame'
+         ]
+
+         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+         writer.writeheader()
+
+         entity_name = data.get('entityName')
+
+         # Process each namespace (dei, us-gaap, etc.)
+         for namespace, concepts in data.get('facts', {}).items():
+             # Process each concept in the namespace
+             for concept_name, concept_data in concepts.items():
+                 # Get units data (shares, USD, etc.)
+                 units = concept_data.get('units', {})
+
+                 # Process each unit type
+                 for unit_type, values in units.items():
+                     # Process each value (each filing/period)
+                     for value_data in values:
+                         # Create a row for the CSV
+                         row = {
+                             'cik': company_cik,
+                             'entity_name': entity_name,
+                             'namespace': namespace,
+                             'concept_name': concept_name,
+                             'end_date': value_data.get('end'),
+                             'value': value_data.get('val'),
+                             'unit': unit_type,
+                             'accession_number': value_data.get('accn'),
+                             'fiscal_year': value_data.get('fy'),
+                             'fiscal_period': value_data.get('fp'),
+                             'form_type': value_data.get('form'),
+                             'filed_date': value_data.get('filed'),
+                             'frame': value_data.get('frame')
+                         }
+                         writer.writerow(row)
+
+     return True
+
+ def download_company_facts(cik, output_dir, requests_per_second=5):
+     # Create output directory if it doesn't exist
+     output_path = Path(output_dir)
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     # Handle both single CIK and list
+     if isinstance(cik, list):
+         # Define callback to process the data for each CIK
+         def callback(data):
+             process_company_data(data, output_path)
+
+         # Process all CIKs in parallel
+         results = stream_company_facts(
+             cik=cik,
+             requests_per_second=requests_per_second,
+             callback=callback
+         )
+
+         # Just return since the callback handles the processing
+         return True
+     else:
+         # Single CIK case
+         result = stream_company_facts(cik=cik, requests_per_second=requests_per_second)
+         return process_company_data(result, output_path)
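A usage sketch for download_company_facts() (not part of the package diff; the import is omitted because this hunk does not show the file's path, and the CIKs are arbitrary examples).

    # Writes one CSV per CIK into the output directory, e.g. facts/320193.csv,
    # with one row per reported fact (namespace, concept, unit, period, value, ...).
    download_company_facts(cik=[320193, 789019], output_dir="facts", requests_per_second=5)

    # The single-CIK form returns the boolean result of process_company_data()
    ok = download_company_facts(cik=320193, output_dir="facts")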
@@ -0,0 +1,39 @@
+ # simple implementation
+ import requests
+ from ..utils import headers
+
+ def fetch_frame(taxonomy, concept, unit, period):
+     url = f"https://data.sec.gov/api/xbrl/frames/{taxonomy}/{concept}/{unit}/{period}.json"
+     response = requests.get(url, headers=headers)
+     print(url)
+     print(response)
+     return response.json()
+
+
+ def filter_xbrl(taxonomy, concept, unit, period, logic, value):
+     response_data = fetch_frame(taxonomy, concept, unit, period)
+
+     if response_data is None:
+         raise ValueError("Unable to fetch XBRL data. Incorrect parameters?")
+
+     # input validation
+     value = int(value)
+
+     # Filter data based on logic and value
+     data = response_data['data']
+
+     if logic == '>':
+         return [row['accn'] for row in data if row['val'] > value]
+     elif logic == '<':
+         return [row['accn'] for row in data if row['val'] < value]
+     elif logic == '>=':
+         return [row['accn'] for row in data if row['val'] >= value]
+     elif logic == '<=':
+         return [row['accn'] for row in data if row['val'] <= value]
+     elif logic == '==':
+         return [row['accn'] for row in data if row['val'] == value]
+     elif logic == '!=':
+         return [row['accn'] for row in data if row['val'] != value]
+     else:
+         raise ValueError(f"Invalid logic operator: {logic}")
+
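A usage sketch for filter_xbrl() (not part of the package diff). The taxonomy, concept, unit, and period values follow the conventions of the SEC frames endpoint that fetch_frame() builds its URL from, and are only illustrative.

    # Accession numbers of filings reporting more than $5B of accounts payable
    # in the instantaneous frame for Q4 2019
    acc_nos = filter_xbrl(
        taxonomy="us-gaap",
        concept="AccountsPayableCurrent",
        unit="USD",
        period="CY2019Q4I",
        logic=">",
        value=5_000_000_000,
    )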
@@ -0,0 +1,93 @@
+ import asyncio
+ import aiohttp
+ import json
+ from tqdm import tqdm
+ from ..utils import PreciseRateLimiter, RateMonitor, RetryException, headers
+
+ async def fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar):
+     # Format CIK with leading zeros to 10 digits
+     formatted_cik = f"CIK{str(cik).zfill(10)}"
+     url = f"https://data.sec.gov/api/xbrl/companyfacts/{formatted_cik}.json"
+
+     try:
+         # Acquire rate limit token
+         await rate_limiter.acquire()
+
+         async with session.get(url, headers=headers) as response:
+             content_length = int(response.headers.get('Content-Length', 0))
+             await rate_monitor.add_request(content_length)
+
+             # Log current rates
+             req_rate, mb_rate = rate_monitor.get_current_rates()
+             pbar.set_postfix({"req/s": req_rate, "MB/s": mb_rate})
+
+             # Handle rate limiting
+             if response.status == 429:
+                 retry_after = int(response.headers.get('Retry-After', 601))
+                 pbar.set_description(f"Rate limited, retry after {retry_after}s")
+                 await asyncio.sleep(retry_after)
+                 pbar.set_description(f"Fetching CIK {cik}")
+                 return await fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar)
+
+             # Handle other errors
+             if response.status != 200:
+                 pbar.update(1)
+                 return {"error": f"HTTP {response.status}", "cik": cik}
+
+             data = await response.json()
+             pbar.update(1)
+             return data
+
+     except Exception as e:
+         pbar.update(1)
+         return {"error": str(e), "cik": cik}
+
+ async def stream_companyfacts(cik=None, requests_per_second=5, callback=None):
+     if cik is None:
+         return {"error": "No CIK provided. Please specify a CIK."}
+
+     # Handle both single CIK and list of CIKs
+     if not isinstance(cik, list):
+         cik_list = [cik]
+     else:
+         cik_list = cik
+
+     # Initialize rate limiter and monitor
+     rate_limiter = PreciseRateLimiter(rate=requests_per_second)
+     rate_monitor = RateMonitor(window_size=10.0)
+
+     # Create progress bar
+     pbar = tqdm(total=len(cik_list), desc="Fetching company facts")
+
+     results = []
+     async with aiohttp.ClientSession() as session:
+         # Create tasks for all CIKs
+         tasks = [
+             fetch_company_facts(session, cik_item, rate_limiter, rate_monitor, pbar)
+             for cik_item in cik_list
+         ]
+
+         # Process tasks as they complete
+         for completed_task in asyncio.as_completed(tasks):
+             data = await completed_task
+
+             # Call callback if provided
+             if callback and not (data and 'error' in data):
+                 callback(data)
+
+             results.append(data)
+
+     pbar.close()
+
+     # If single CIK was passed, return just that result
+     if len(cik_list) == 1:
+         return results[0]
+
+     # Otherwise return all results
+     return results
+
+ def stream_company_facts(cik=None, requests_per_second=5, callback=None):
+     loop = asyncio.get_event_loop()
+     return loop.run_until_complete(
+         stream_companyfacts(cik=cik, requests_per_second=requests_per_second, callback=callback)
+     )
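Finally, a usage sketch for the synchronous wrapper above (not part of the package diff; the import is omitted because this hunk does not show the file's path, and the CIKs are arbitrary examples). A single CIK returns one dict, a list returns a list of dicts, and the optional callback is invoked once per successful response.

    def show(data):
        print(data.get("entityName"), "-", len(data.get("facts", {})), "namespaces")

    facts = stream_company_facts(cik=320193)                           # one dict
    many = stream_company_facts(cik=[320193, 789019], callback=show)   # list of dicts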