datamule 1.2.5__py3-none-any.whl → 1.2.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registries. It is provided for informational purposes only.
@@ -18,7 +18,12 @@ from .mappings.thirteenfhr import *
 from .mappings.twentyfivense import *
 from .mappings.twentyfourf2nt import *
 from .mappings.information_table import *
+from .mappings.submission_metadata import *
+from .mappings.ex102_abs import *
+from .mappings.d import *
 
+from pathlib import Path
+import csv
 # need to check if mappings correctly create new columns
 class Table():
     def __init__(self, data, type,accession):
@@ -27,11 +32,18 @@ class Table():
         self.type = type
         self.data = data
         self.accession = accession
-        self.columns = self.determine_columns()
+        self.columns = self.determine_columns_complete()
+
+    def determine_columns_complete(self):
+        if not self.data:
+            return []
+        return list(set().union(*(row.keys() for row in self.data)))
+
 
     def determine_columns(self):
         if len(self.data) == 0:
             return []
+
         return self.data[0].keys()
 
     def add_column(self,column_name,value):
@@ -190,6 +202,17 @@ class Table():
         elif self.type == 'signature_schedule_13':
             mapping_dict = signature_schedule_13_dict
 
+        # D
+        elif self.type == 'issuer_list_d':
+            mapping_dict = issuer_list_d_dict
+        elif self.type == 'metadata_d':
+            mapping_dict = metadata_d_dict
+        elif self.type == 'offering_data_d':
+            mapping_dict = offering_data_d_dict
+        elif self.type == 'primary_issuer_d':
+            mapping_dict = primary_issuer_d_dict
+        elif self.type == 'related_persons_list_d':
+            mapping_dict = related_persons_d_dict
         # SDR
         elif self.type == 'sdr':
             mapping_dict = sdr_dict
@@ -227,6 +250,15 @@ class Table():
             mapping_dict = item_9_24f2nt_dict
         elif self.type == 'signature_info_schedule_a':
             mapping_dict = signature_24f2nt_dict
+        # ABS
+        elif self.type == 'assets_ex102_absee':
+            mapping_dict = assets_dict_ex102_abs
+        elif self.type =='properties_ex102_absee':
+            mapping_dict = properties_dict_ex102_abs
+        # submission metadata
+        elif self.type == 'document_submission_metadata':
+            mapping_dict = document_submission_metadata_dict
+
 
         else:
             mapping_dict = {}
@@ -245,9 +277,6 @@ class Table():
             for old_key, new_key in mapping_dict.items():
                 if old_key in row:
                     ordered_row[new_key] = row.pop(old_key)
-                else:
-                    # if the old key is not present, set the new key to None
-                    ordered_row[new_key] = None
 
             # Then add any remaining keys that weren't in the mapping
             for key, value in row.items():
@@ -257,4 +286,30 @@ class Table():
             row.clear()
             row.update(ordered_row)
 
-        self.determine_columns()
+        # Update the columns after mapping
+        columns = set(self.columns)
+        # remove the old columns that are now in the mapping
+        columns.difference_update(mapping_dict.keys())
+        # add the new columns from the mapping
+        columns.update(mapping_dict.values())
+        # add the accession column to the columns
+        columns.add('accession')
+
+        self.columns = list(columns)
+
+    def write_csv(self, output_file):
+        output_file = Path(output_file)
+        fieldnames = self.columns
+
+        # Check if the file already exists
+        if output_file.exists():
+            # Append to existing file without writing header
+            with open(output_file, 'a', newline='') as csvfile:
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                writer.writerows(self.data)
+        else:
+            # Create new file with header
+            with open(output_file, 'w', newline='') as csvfile:
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                writer.writeheader()
+                writer.writerows(self.data)
datamule/helper.py CHANGED
@@ -79,7 +79,16 @@ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
 
     # Convert ticker to CIK if provided
    if ticker is not None:
-        cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
+        if isinstance(ticker, str):
+            ticker = [ticker]
+
+        ciks_from_ticker = []
+        for t in ticker:
+            ciks = get_cik_from_dataset('listed_filer_metadata', 'ticker', t)
+            if ciks:
+                ciks_from_ticker.extend(ciks)
+
+        cik = ciks
 
     # Normalize CIK format
     if cik is not None:
datamule/index.py CHANGED
@@ -1,16 +1,16 @@
-from pathlib import Path
+
 from .sec.submissions.textsearch import query
-from .helper import _process_cik_and_metadata_filters, load_package_dataset
+from .helper import _process_cik_and_metadata_filters
+from pathlib import Path
 
 class Index:
-    def __init__(self, path=None):
-        self.path = Path(path) if path else None
+    def __init__(self):
+        pass
 
     def search_submissions(
         self,
         text_query,
-        start_date=None,
-        end_date=None,
+        filing_date=None,
         submission_type=None,
         cik=None,
         ticker=None,
@@ -47,16 +47,14 @@ class Index:
         # Execute the search query
         results = query(
             f'{text_query}',
-            filing_date=(start_date, end_date),
+            filing_date=filing_date,
             requests_per_second=requests_per_second,
             quiet=quiet,
             submission_type=submission_type,
             **kwargs
         )
 
-        # Save results to path if specified
-        if self.path:
-            self._save_results(results, text_query)
+
 
         return results
 
datamule/portfolio.py CHANGED
@@ -9,22 +9,28 @@ import os
 from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
-from .sec.submissions.monitor import monitor
-from .sec.xbrl.xbrlmonitor import XBRLMonitor
+from .sec.submissions.monitor import Monitor
+#from .sec.xbrl.xbrlmonitor import XBRLMonitor
 
 
 class Portfolio:
     def __init__(self, path):
         self.path = Path(path)
+        self.api_key = None
         self.submissions = []
         self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1
+
+        self.monitor = Monitor()
 
         if self.path.exists():
             self._load_submissions()
             self.submissions_loaded = True
         else:
             self.path.mkdir(parents=True, exist_ok=True)
+
+    def set_api_key(self, api_key):
+        self.api_key = api_key
 
     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir()]
@@ -132,6 +138,7 @@ class Portfolio:
         seclibrary_download(
             output_dir=self.path,
             cik=cik,
+            api_key=self.api_key,
             submission_type=submission_type,
             filing_date=filing_date,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
@@ -149,20 +156,18 @@ class Portfolio:
         )
 
         self.submissions_loaded = False
-    def monitor_submissions(self,data_callback=None, poll_callback=None, submission_type=None, cik=None,
-                            polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=600000):
 
-        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
 
-        monitor(
+        self.monitor.monitor_submissions(
            data_callback=data_callback,
-            poll_callback=poll_callback,
-            cik=cik,
-            submission_type=submission_type,
+            interval_callback=interval_callback,
            polling_interval=polling_interval,
-            requests_per_second=requests_per_second,
            quiet=quiet,
-            start_date=start_date
+            start_date=start_date,
+            validation_interval=validation_interval
        )
 
 
@@ -1,130 +1,183 @@
+import time
+from collections import deque
+from datetime import datetime
+import xml.etree.ElementTree as ET
+import re
 import asyncio
-from datetime import datetime, timedelta
-from .eftsquery import EFTSQuery # Import the class directly instead of the function
-from ..rss.monitor import start_monitor # Import start_monitor directly
-import pytz
+from ..utils import headers, PreciseRateLimiter
+from .eftsquery import EFTSQuery
+import aiohttp
 
 
-async def _process_efts_hits(hits, collected_accession_numbers, data_callback=None,rate_limiter=None):
-    """Process EFTS hits, collect accession numbers, and call data callback."""
-    processed_hits = []
+async def poll_rss(limiter):
+    base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
 
-    for hit in hits:
-        try:
-            source = hit.get('_source', {})
-
-            # Extract key fields
-            accession_number = source.get('adsh')
-
-            # Extract submission_type (form) and ciks
-            submission_type = source.get('form')
-            ciks = source.get('ciks', [])
-            ciks = [str(int(cik)) for cik in ciks]
-
-            filing_date = source.get('file_date')
-
-            # Create standardized filing record
-            filing = {
-                'accession_number': accession_number,
-                'submission_type': submission_type,
-                'ciks': ciks,
-                'filing_date': filing_date,
-
-            }
-
-            processed_hits.append(filing)
-            collected_accession_numbers.add(accession_number) # Changed append to add for set operation
-
-        except Exception as e:
-            print(f"Error processing EFTS hit: {e}")
+    # Create a session specifically for this RSS polling operation
+    async with aiohttp.ClientSession(headers=headers) as session:
+        # Use the rate limiter before making the request
+        async with limiter:
+            # Make the HTTP request with the session
+            async with session.get(base_url) as response:
+                content = await response.read()
 
-    # Call data callback if provided
-    if data_callback and processed_hits:
-        await data_callback(processed_hits, rate_limiter)
+    # Process the content
+    content_str = content.decode('utf-8')
+    root = ET.fromstring(content_str)
+    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
+    entries = root.findall('atom:entry', namespace)
+    grouped = {}
+
+    for entry in entries:
+        url = entry.find('atom:link', namespace).get('href')
+        accession = re.search(r'/(\d{10})-(\d{2})-(\d{6})', url)
+        accession = accession.group(1) + accession.group(2) + accession.group(3)
+        cik = re.search(r'/data/(\d+)/', url).group(1)
 
-    return processed_hits
-
-async def _master_monitor_impl(data_callback=None, poll_callback=None, submission_type=None, cik=None,
-                               polling_interval=200, requests_per_second=2.0, quiet=True, start_date=None):
-    """Implementation of the master monitor."""
-    # Set default start date to today if not provided (eastern)
-    eastern_tz = pytz.timezone('US/Eastern')
-    current_date = datetime.now(eastern_tz).strftime('%Y-%m-%d')
-    if not start_date:
-        start_date = current_date
+        if accession not in grouped:
+            grouped[accession] = {'submission_type': '', 'ciks': set(), 'filing_date': ''}
 
-    # Changed from list to set for more efficient lookups
-    collected_accession_numbers = set()
-
-    if not quiet:
-        print(f"Starting SEC monitoring from {start_date}")
-
-    # Step 1: Query EFTS for all filings from start_date up to current date
-    if not quiet:
-        print(f"Fetching filings from {start_date} to {current_date}...")
-
-    # Prepare a wrapper callback to collect accession numbers
-    async def process_callback(hits):
-        await _process_efts_hits(hits, collected_accession_numbers, data_callback, efts_query.limiter)
-
-    # Create an EFTSQuery instance
-    efts_query = EFTSQuery(requests_per_second=requests_per_second)
+        grouped[accession]['ciks'].add(cik)
+        grouped[accession]['submission_type'] = entry.find('atom:category', namespace).get('term')
+        summary_text = entry.find('atom:summary', namespace).text
+        filing_date_match = re.search(r'Filed:</b>\s*(\d{4}-\d{2}-\d{2})', summary_text)
+        if filing_date_match:
+            grouped[accession]['filing_date'] = filing_date_match.group(1)
+
+    results = [{'accession': int(k.replace('-', '')), 'submission_type': v['submission_type'], 'ciks': list(v['ciks']), 'filing_date': v['filing_date']} for k, v in grouped.items()]
+    return results
+
+def clean_efts_hits(hits):
+    # clean hits
+    hits = [{'accession': int(hit['_source']['adsh'].replace('-','')), 'filing_date': hit['_source']['file_date'], 'ciks': hit['_source']['ciks']} for hit in hits]
+    return hits
+
+class Monitor():
+    def __init__(self):
+        self.accessions = deque(maxlen=50000)
+        self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
+        self.efts_query = EFTSQuery(quiet=True)
+        self.efts_query.limiter = self.ratelimiters['sec.gov']
+
+    def set_domain_rate_limit(self, domain, rate):
+        self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
+        if domain == 'sec.gov':
+            self.efts_query.limiter = self.ratelimiters[domain]
 
-    # Run EFTS query for the date range
-    async with efts_query:
-        await efts_query.query(
-            cik=cik,
-            submission_type=submission_type,
-            filing_date=(start_date, current_date),
-            callback=process_callback
+    async def _async_run_efts_query(self, **kwargs):
+        """Async helper method to run EFTS query without creating a new event loop"""
+        # Make sure to set quiet parameter if provided in kwargs
+        self.efts_query.quiet = kwargs.get('quiet', True)
+        return await self.efts_query.query(
+            cik=kwargs.get('cik'),
+            submission_type=kwargs.get('submission_type'),
+            filing_date=kwargs.get('filing_date'),
+            location=kwargs.get('location'),
+            callback=kwargs.get('callback'),
+            name=kwargs.get('name')
         )
+
+    async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
+        """
+        Async implementation of monitor_submissions.
+        """
+
+        # Backfill if start_date is provided
+        if start_date is not None:
+            today_date = datetime.now().date().strftime('%Y-%m-%d')
+            if not quiet:
+                print(f"Backfilling from {start_date} to {today_date}")
+
+            hits = clean_efts_hits(await self._async_run_efts_query(
+                filing_date=(start_date, today_date),
+                quiet=quiet
+            ))
+
+            new_hits = self._filter_new_accessions(hits)
+            if not quiet:
+                print(f"New submissions found: {len(new_hits)}")
+            if new_hits and data_callback:
+                data_callback(new_hits)
+
+        last_polling_time = time.time()
+        last_validation_time = last_polling_time
+        current_time = last_polling_time
+
+        while True:
+            # RSS polling
+            if not quiet:
+                print(f"Polling RSS feed")
+            results = await poll_rss(self.ratelimiters['sec.gov'])
+            new_results = self._filter_new_accessions(results)
+            if new_results:
+                if not quiet:
+                    print(f"Found {len(new_results)} new submissions via RSS")
+                if data_callback:
+                    data_callback(new_results)
+
+            # EFTS validation
+            if validation_interval and (current_time - last_validation_time) >= validation_interval/1000:
+                # Get submissions from the last 24 hours for validation
+                today_date = datetime.now().strftime('%Y-%m-%d')
+                if not quiet:
+                    print(f"Validating submissions from {today_date}")
+
+                hits = clean_efts_hits(await self._async_run_efts_query(
+                    filing_date=(today_date, today_date),
+                    quiet=quiet
+                ))
+
+                new_hits = self._filter_new_accessions(hits)
+                if new_hits:
+                    if not quiet:
+                        print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                    if data_callback:
+                        data_callback(new_hits)
+                last_polling_time = time.time()
+                last_validation_time = current_time
+
+            # Interval callback
+            if interval_callback:
+                interval_callback()
+
+            next_poll_time = last_polling_time + (polling_interval / 1000)
+            current_time = time.time()
+            time_to_sleep = max(0, next_poll_time - current_time)
+            await asyncio.sleep(time_to_sleep)
+            last_polling_time = next_poll_time
+
+
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=60000):
+        """
+        Monitor SEC submissions using the EDGAR system.
+        :param data_callback: function to call with the data
+        :param interval_callback: function that executes between polls
+        :param polling_interval: interval between polls in milliseconds
+        :param quiet: if True, suppresses output
+        :param start_date: backfill start date in YYYY-MM-DD format
+        :param validation_interval: interval between validation in milliseconds
+
+        This function combines the speed of the RSS feed (fast, but misses some submissions) with the accuracy of the EFTS system.
+        """
+        # This is now a synchronous wrapper around the async implementation
+        return asyncio.run(self._async_monitor_submissions(
+            data_callback=data_callback,
+            interval_callback=interval_callback,
+            polling_interval=polling_interval,
+            quiet=quiet,
+            start_date=start_date,
+            validation_interval=validation_interval
+        ))
 
-    if not quiet:
-        print(f"Historical query complete. Collected {len(collected_accession_numbers)} accession numbers.")
-
-    # Step 2: Hand off to RSS monitor with collected accession numbers
-    if not quiet:
-        print("Starting real-time RSS monitoring...")
-
-    # Start RSS monitor with the set of accession numbers to skip (from EFTS)
-    # and an empty list for ongoing tracking
-    await start_monitor(
-        data_callback=data_callback,
-        poll_callback=poll_callback,
-        submission_type=submission_type,
-        cik=cik,
-        polling_interval=polling_interval,
-        requests_per_second=requests_per_second,
-        quiet=quiet,
-        known_accession_numbers=[], # Start with an empty list for ongoing tracking
-        skip_initial_accession_numbers=collected_accession_numbers # Pass the EFTS accession numbers as the skip list
-    )
-
-def monitor(data_callback=None, poll_callback=None, submission_type=None, cik=None,
-            polling_interval=200, requests_per_second=2.0, quiet=True, start_date=None):
-    """
-    Monitor SEC filings by combining EFTS historical queries with real-time RSS monitoring.
-
-    Parameters:
-    data_callback (callable): Async function to call when new filings are found.
-                              Will be called with a list of dicts containing
-                              'accession_number', 'submission_type', and 'ciks'.
-    poll_callback (callable): Async function to call during RSS polling wait periods.
-    submission_type (str or list): Form type(s) to monitor (e.g., "8-K", "10-Q").
-    cik (str or list): CIK(s) to monitor.
-    polling_interval (int): Polling interval in milliseconds for RSS monitor.
-    requests_per_second (float): Maximum requests per second.
-    quiet (bool): Suppress verbose output.
-    start_date (str): ISO format date (YYYY-MM-DD) from which to start monitoring.
-                      If None, will start from current date. (EASTERN TIME)
-    """
-    return asyncio.run(_master_monitor_impl(
-        data_callback=data_callback,
-        poll_callback=poll_callback,
-        submission_type=submission_type,
-        cik=cik,
-        polling_interval=polling_interval,
-        requests_per_second=requests_per_second,
-        quiet=quiet,
-        start_date=start_date
-    ))
+    def _filter_new_accessions(self, items):
+        """Filter items to only include those with new accession numbers."""
+        new_items = []
+        for item in items:
+            accession = item['accession']
+            if accession not in self.accessions:
+                self.accessions.append(accession)
+                new_items.append(item)
+        return new_items
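
The rewritten monitor passes data_callback plain dicts rather than the old 'accession_number' records: RSS hits carry 'accession' (an int with dashes stripped), 'submission_type', 'ciks', and 'filing_date', while EFTS backfill/validation hits omit 'submission_type'. A sketch of a callback written against that shape, using Monitor directly (the import path is inferred from the Portfolio import earlier in this diff):

    from datamule.sec.submissions.monitor import Monitor  # inferred path

    def handle(records):
        for r in records:
            print(r['accession'], r.get('submission_type', ''), r['ciks'], r['filing_date'])

    monitor = Monitor()
    monitor.set_domain_rate_limit('sec.gov', 5)   # requests per second for sec.gov
    monitor.monitor_submissions(data_callback=handle, polling_interval=1000, quiet=False)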
@@ -1,8 +1,4 @@
 import asyncio
-import aiohttp
-from datetime import datetime
-from urllib.parse import urlencode
-from tqdm import tqdm
 from .eftsquery import EFTSQuery
 
 class TextSearchEFTSQuery(EFTSQuery):
@@ -2,7 +2,7 @@ import asyncio
 import aiohttp
 import json
 from tqdm import tqdm
-from ..utils import PreciseRateLimiter, RateMonitor, RetryException, headers
+from ..utils import PreciseRateLimiter, RateMonitor, headers
 
 async def fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar):
     # Format CIK with leading zeros to 10 digits
@@ -1,7 +1,6 @@
 import os
 import asyncio
 import aiohttp
-from pathlib import Path
 from tqdm import tqdm
 import time
 import shutil
@@ -13,11 +12,12 @@ from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from queue import Queue, Empty
 from threading import Thread
-from secsgml import parse_sgml_submission
 from .query import query
 from os import cpu_count
 from ..submission import Submission
 
+
+
 class Downloader:
     def __init__(self, api_key=None):
         self.BASE_URL = "https://library.datamule.xyz/original/nc/"