datamule 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
+ from .mappings.atsn import *
+ from .mappings.cfportal import *
+ from .mappings.ex99a_sdr import *
+ from .mappings.ex99c_sdr import *
+ from .mappings.ex99g_sdr import *
+ from .mappings.ex99i_sdr import *
+ from .mappings.nmfp import *
+ from .mappings.npx import *
+ from .mappings.onefourtyfour import *
+ from .mappings.ownership import *
+ from .mappings.proxy_voting_record import *
+ from .mappings.sbs import *
+ from .mappings.sbsef import *
+ from .mappings.schedule13 import *
+ from .mappings.sdr import *
+ from .mappings.ta import *
+ from .mappings.thirteenfhr import *
+ from .mappings.twentyfivense import *
+ from .mappings.twentyfourf2nt import *
+ from .mappings.information_table import *
+
+ # need to check if mappings correctly create new columns
+ class Table():
+     def __init__(self, data, type, accession):
+         if isinstance(data, dict):
+             data = [data]
+         self.type = type
+         self.data = data
+         self.accession = accession
+         self.columns = self.determine_columns()
+
+     def determine_columns(self):
+         if len(self.data) == 0:
+             return []
+         return self.data[0].keys()
+
+     def add_column(self, column_name, value):
+         for row in self.data:
+             row[column_name] = value
+
+     def map_data(self):
+         # Add the accession column to all rows first, ensuring it will be first
+         self.add_column('accession', self.accession)
+
+
+         # ATS-N, types: metadata_ats,cover_ats,part_one_ats,part_two_ats,part_three_ats,part_four_ats
+         if self.type == 'metadata_ats':
+             mapping_dict = metadata_ats_dict
+         elif self.type == 'cover_ats':
+             mapping_dict = cover_ats_dict
+         elif self.type == 'part_one_ats':
+             mapping_dict = part_one_ats_dict
+         elif self.type == 'part_two_ats':
+             mapping_dict = part_two_ats_dict
+         elif self.type == 'part_three_ats':
+             mapping_dict = part_three_ats_dict
+         elif self.type == 'part_four_ats':
+             mapping_dict = part_four_ats_dict
+         # CFPORTAL
+         elif self.type == 'metadata_cfportal':
+             mapping_dict = metadata_cfportal_dict
+         elif self.type == 'identifying_information_cfportal':
+             mapping_dict = identifying_information_cfportal_dict
+         elif self.type == 'form_of_organization_cfportal':
+             mapping_dict = form_of_organization_cfportal_dict
+         elif self.type == 'successions_cfportal':
+             mapping_dict = successions_cfportal_dict
+         elif self.type == 'control_relationships_cfportal':
+             mapping_dict = control_relationships_cfportal_dict
+         elif self.type == 'disclosure_answers_cfportal':
+             mapping_dict = disclosure_answers_cfportal_dict
+         elif self.type == 'non_securities_related_business_cfportal':
+             mapping_dict = non_securities_related_business_cfportal_dict
+         elif self.type == 'escrow_arrangements_cfportal':
+             mapping_dict = escrow_arrangements_cfportal_dict
+         elif self.type == 'execution_cfportal':
+             mapping_dict = execution_cfportal_dict
+         elif self.type == 'schedule_a_cfportal':
+             mapping_dict = schedule_a_cfportal_dict
+         elif self.type == 'schedule_b_cfportal':
+             mapping_dict = schedule_b_cfportal_dict
+         elif self.type == 'schedule_c_cfportal':
+             mapping_dict = schedule_c_cfportal_dict
+         elif self.type == 'schedule_d_cfportal':
+             mapping_dict = schedule_d_cfportal_dict
+         elif self.type == 'criminal_drip_info_cfportal':
+             mapping_dict = criminal_drip_info_cfportal_dict
+         elif self.type == 'regulatory_drip_info_cfportal':
+             mapping_dict = regulatory_drip_info_cfportal_dict
+         elif self.type == 'civil_judicial_drip_info_cfportal':
+             mapping_dict = civil_judicial_drip_info_cfportal_dict
+         elif self.type == 'bankruptcy_sipc_drip_info_cfportal':
+             mapping_dict = bankruptcy_sipc_drip_info_cfportal_dict
+         elif self.type == 'bond_drip_info_cfportal':
+             mapping_dict = bond_drip_info_cfportal_dict
+         elif self.type == 'judgement_drip_info_cfportal':
+             mapping_dict = judgement_drip_info_cfportal_dict
+
+         # SDR
+
+         # Information Table
+         elif self.type == 'information_table':
+             mapping_dict = information_table_dict
+
+         # NMFP
+         elif self.type == 'metadata_nmfp':
+             mapping_dict = metadata_nmfp_dict
+         elif self.type == 'general_information_nmfp':
+             mapping_dict = general_information_nmfp_dict
+         elif self.type == 'series_level_info_nmfp':
+             mapping_dict = series_level_info_nmfp_dict
+         elif self.type == 'class_level_info_nmfp':
+             mapping_dict = class_level_info_nmfp_dict
+         elif self.type == 'schedule_of_portfolio_securities_info_nmfp':
+             mapping_dict = schedule_of_portfolio_securities_info_nmfp_dict
+         elif self.type == 'signature_nmfp':
+             mapping_dict = signature_nmfp_dict
+
+         # NPX
+         elif self.type == 'npx':
+             mapping_dict = npx_dict
+
+         # 144
+         elif self.type == 'signatures_144':
+             mapping_dict = signatures_144_dict
+         elif self.type == 'securities_sold_in_past_3_months_144':
+             mapping_dict = securities_sold_in_past_3_months_144_dict
+         elif self.type == 'securities_to_be_sold_144':
+             mapping_dict = securities_to_be_sold_144_dict
+         elif self.type == 'securities_information_144':
+             mapping_dict = securities_information_144_dict
+         elif self.type == 'issuer_information_144':
+             mapping_dict = issuer_information_144_dict
+         elif self.type == 'metadata_144':
+             mapping_dict = metadata_144_dict
+
+         # Ownership
+         elif self.type == 'non_derivative_holding_ownership':
+             mapping_dict = non_derivative_holding_ownership_dict
+         elif self.type == 'non_derivative_transaction_ownership':
+             mapping_dict = non_derivative_transaction_ownership_dict
+         elif self.type == 'derivative_transaction_ownership':
+             mapping_dict = derivative_transaction_ownership_dict
+         elif self.type == 'derivative_holding_ownership':
+             mapping_dict = derivative_holding_ownership_dict
+         elif self.type == 'reporting_owner_ownership':
+             mapping_dict = reporting_owner_ownership_dict
+         elif self.type == 'metadata_ownership':
+             mapping_dict = metadata_ownership_dict
+         elif self.type == 'owner_signature_ownership':
+             mapping_dict = owner_signature_ownership_dict
+
+         # Proxy Voting Record
+         elif self.type == 'proxy_voting_record':
+             mapping_dict = proxy_voting_record_dict
+
+         # SBS
+
+         # SBSEF
+         elif self.type == 'sbsef':
+             mapping_dict = sbsef_dict
+
+         # Schedule 13
+         elif self.type == 'metadata_schedule_13':
+             mapping_dict = metadata_schedule_13_dict
+         elif self.type == 'cover_schedule_13':
+             mapping_dict = cover_schedule_13_dict
+         elif self.type == 'reporting_person_details_schedule_13':
+             mapping_dict = reporting_person_details_schedule_13_dict
+         elif self.type == 'item_1_schedule_13':
+             mapping_dict = item_1_schedule_13_dict
+         elif self.type == 'item_2_schedule_13':
+             mapping_dict = item_2_schedule_13_dict
+         elif self.type == 'item_3_schedule_13':
+             mapping_dict = item_3_schedule_13_dict
+         elif self.type == 'item_4_schedule_13':
+             mapping_dict = item_4_schedule_13_dict
+         elif self.type == 'item_5_schedule_13':
+             mapping_dict = item_5_schedule_13_dict
+         elif self.type == 'item_6_schedule_13':
+             mapping_dict = item_6_schedule_13_dict
+         elif self.type == 'item_7_schedule_13':
+             mapping_dict = item_7_schedule_13_dict
+         elif self.type == 'item_8_schedule_13':
+             mapping_dict = item_8_schedule_13_dict
+         elif self.type == 'item_9_schedule_13':
+             mapping_dict = item_9_schedule_13_dict
+         elif self.type == 'item_10_schedule_13':
+             mapping_dict = item_10_schedule_13_dict
+         elif self.type == 'signature_schedule_13':
+             mapping_dict = signature_schedule_13_dict
+
+         # SDR
+         elif self.type == 'sdr':
+             mapping_dict = sdr_dict
+
+         # TA
+
+         # 13F-HR
+         elif self.type == '13fhr':
+             mapping_dict = thirteenfhr_dict
+
+         # 25-NSE
+         elif self.type == '25nse':
+             mapping_dict = twentyfive_nse_dict
+
+         # 24F-2NT
+         elif self.type == 'metadata_24f_2nt':
+             mapping_dict = metadata_24f_2nt_dict
+         elif self.type == 'item_1_24f2nt':
+             mapping_dict = item_1_24f2nt_dict
+         elif self.type == 'item_2_24f2nt':
+             mapping_dict = item_2_24f2nt_dict
+         elif self.type == 'item_3_24f2nt':
+             mapping_dict = item_3_24f2nt_dict
+         elif self.type == 'item_4_24f2nt':
+             mapping_dict = item_4_24f2nt_dict
+         elif self.type == 'item_5_24f2nt':
+             mapping_dict = item_5_24f2nt_dict
+         elif self.type == 'item_6_24f2nt':
+             mapping_dict = item_6_24f2nt_dict
+         elif self.type == 'item_7_24f2nt':
+             mapping_dict = item_7_24f2nt_dict
+         elif self.type == 'item_8_24f2nt':
+             mapping_dict = item_8_24f2nt_dict
+         elif self.type == 'item_9_24f2nt':
+             mapping_dict = item_9_24f2nt_dict
+         elif self.type == 'signature_info_schedule_a':
+             mapping_dict = signature_24f2nt_dict
+
+         else:
+             mapping_dict = {}
+
+         # Update mapping dictionary to include accession at the beginning
+         # Create a new mapping with accession as the first key
+         new_mapping = {'accession': 'accession'}
+         # Add the rest of the mapping
+         new_mapping.update(mapping_dict)
+         mapping_dict = new_mapping
+
+         # apply the mapping to the data
+         for row in self.data:
+             ordered_row = {}
+             # First add all keys from the mapping dict in order
+             for old_key, new_key in mapping_dict.items():
+                 if old_key in row:
+                     ordered_row[new_key] = row.pop(old_key)
+                 else:
+                     # if the old key is not present, set the new key to None
+                     ordered_row[new_key] = None
+
+             # Then add any remaining keys that weren't in the mapping
+             for key, value in row.items():
+                 ordered_row[key] = value
+
+             # Replace the original row with the ordered row
+             row.clear()
+             row.update(ordered_row)
+
+         self.determine_columns()
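For orientation, a minimal usage sketch of the new Table class (the row keys and accession number below are hypothetical; the actual column renames come from the star-imported mapping dicts at the top of the file):

    # Hypothetical 13F information-table rows; real keys depend on the mapping dicts.
    rows = [{'nameOfIssuer': 'ACME CORP', 'value': '1000'}]
    table = Table(rows, type='information_table', accession='0000000000-25-000001')
    table.map_data()  # 'accession' becomes the first key; mapped keys missing from a row become None
    print(list(table.data[0].keys()))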
@@ -0,0 +1,31 @@
+
+ from pathlib import Path
+ import urllib.request
+ import gzip
+ import shutil
+ import os
+
+ class PackageUpdater():
+     def __init__(self):
+         pass
+
+     def update_package_data(self):
+         # Create data directory in user's home
+         data_dir = Path.home() / ".datamule"
+         data_dir.mkdir(exist_ok=True)
+
+         # Download data file
+         file_url = "https://github.com/john-friedman/datamule-data/raw/master/data/filer_metadata/listed_filer_metadata.csv.gz"
+         file_path = data_dir / "listed_filer_metadata.csv"
+         temp_gz_path = data_dir / "listed_filer_metadata.csv.gz"
+
+         if not file_path.exists():
+             print(f"Downloading data to {data_dir}")
+             urllib.request.urlretrieve(file_url, temp_gz_path)
+
+             with gzip.open(temp_gz_path, 'rb') as f_in:
+                 with open(file_path, 'wb') as f_out:
+                     shutil.copyfileobj(f_in, f_out)
+
+             os.remove(temp_gz_path)
+             print(f"Data downloaded to {file_path}")
datamule/portfolio.py CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
          # First query, just set the accession numbers
          self.accession_numbers = new_accession_numbers

-     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, requests_per_second=5, **kwargs):
+     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, document_type=None, requests_per_second=5, **kwargs):
          if provider is None:
              config = Config()
              provider = config.get_default_source()
@@ -134,7 +134,8 @@ class Portfolio:
                  cik=cik,
                  submission_type=submission_type,
                  filing_date=filing_date,
-                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                 keep_document_types=document_type
              )
          else:
              sec_download(
@@ -143,7 +144,8 @@ class Portfolio:
                  submission_type=submission_type,
                  filing_date=filing_date,
                  requests_per_second=requests_per_second,
-                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                 keep_document_types=document_type
              )

          self.submissions_loaded = False
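Taken together, these hunks thread a new document_type argument from Portfolio.download_submissions through to both providers as keep_document_types. A hedged sketch of the resulting call (the import path, constructor argument, and filter values are assumptions for illustration):

    from datamule import Portfolio  # import path assumed from the package layout

    portfolio = Portfolio('filings')  # constructor argument assumed to be the output directory
    # Keep only the primary 10-K document and one exhibit type per submission:
    portfolio.download_submissions(ticker='TSLA', submission_type='10-K',
                                   document_type=['10-K', 'EX-10.1'])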
@@ -1,35 +1,19 @@
  import os
  import json
  from .streamer import stream
- from secsgml import parse_sgml_submission_into_memory
  import aiofiles
+ from ...submission import Submission

- async def download_callback(hit, content, cik, accno, url, output_dir="filings"):
+ async def download_callback(hit, content, cik, accno, url, output_dir="filings", keep_document_types=None):
      """Save downloaded SEC submission to disk."""
      try:
-         # Parse the SGML content
-         metadata, documents = parse_sgml_submission_into_memory(content=content.decode('utf-8', errors='replace'))
+         # Create a Submission object directly from the content
+         # Note: the content needs to be decoded from bytes to string for the parser
+         submission = Submission(sgml_content=content.decode('utf-8', errors='replace'),
+                                 keep_document_types=keep_document_types)

-         # Create folder structure: output_dir/accno
-         file_dir = os.path.join(output_dir, str(accno))
-         os.makedirs(file_dir, exist_ok=True)
-
-         # Save metadata
-         metadata_path = os.path.join(file_dir, "metadata.json")
-         async with aiofiles.open(metadata_path, 'w') as f:
-             await f.write(json.dumps(metadata, indent=4))
-
-         # Save all documents
-         for idx, _ in enumerate(metadata['documents']):
-             try:
-                 filename = metadata['documents'][idx]['filename']
-             except (KeyError, IndexError):
-                 filename = f"{metadata['documents'][idx].get('sequence', idx)}.txt"
-
-             # Use async file writing
-             doc_path = os.path.join(file_dir, filename)
-             async with aiofiles.open(doc_path, 'wb') as f:
-                 await f.write(documents[idx])
+         # Use the async save method to write the submission to disk
+         file_dir = await submission.save_async(output_dir=output_dir)

          return file_dir
      except Exception as e:
@@ -37,7 +21,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
          return None

  def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-              requests_per_second=5, output_dir="filings", accession_numbers=None, quiet=False):
+              requests_per_second=5, output_dir="filings", accession_numbers=None,
+              quiet=False, keep_document_types=None):
      """
      Download SEC EDGAR filings and extract their documents.

@@ -51,27 +36,19 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
      - output_dir: Directory to save documents
      - accession_numbers: Optional list of accession numbers to filter by
      - quiet: Whether to suppress progress output
+     - keep_document_types: Optional list of document types to keep (e.g. ['10-K', 'EX-10.1'])

      Returns:
      - List of all document paths processed
-
-     Examples:
-     # Download filings by CIK
-     download(cik="1318605", submission_type="10-K")
-
-     # Download filings by company name
-     download(name="Tesla", submission_type="10-K")
-
-     # Download filings with location filter
-     download(name="Apple", location="CA", submission_type="10-K")
      """
-
      # Make sure output directory exists
      os.makedirs(output_dir, exist_ok=True)

      # Create a wrapper for the download_callback that includes the output_dir
      async def callback_wrapper(hit, content, cik, accno, url):
-         return await download_callback(hit, content, cik, accno, url, output_dir=output_dir)
+         return await download_callback(hit, content, cik, accno, url,
+                                        output_dir=output_dir,
+                                        keep_document_types=keep_document_types)

      # Call the stream function with our callback
      return stream(
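The docstring examples removed above still illustrate the call shape; with the new parameter, the same call can also filter which documents are written to disk. A sketch reusing the removed example's values:

    # Download Tesla 10-K filings but keep only the primary document from each submission.
    download(cik="1318605", submission_type="10-K", keep_document_types=["10-K"])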
@@ -16,17 +16,35 @@ from threading import Thread
  from secsgml import parse_sgml_submission
  from .query import query
  from os import cpu_count
+ from ..submission import Submission

  class Downloader:
      def __init__(self, api_key=None):
          self.BASE_URL = "https://library.datamule.xyz/original/nc/"
          self.CHUNK_SIZE = 2 * 1024 * 1024
-         self.MAX_CONCURRENT_DOWNLOADS = 250
+         self.MAX_CONCURRENT_DOWNLOADS = 100
          self.MAX_DECOMPRESSION_WORKERS = cpu_count()
          self.MAX_PROCESSING_WORKERS = cpu_count()
          self.QUEUE_SIZE = 10
          if api_key is not None:
              self._api_key = api_key
+         # Create a shared event loop for async operations
+         self.loop = asyncio.new_event_loop()
+         # Create a thread to run the event loop
+         self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
+         self.loop_thread.start()
+         # Create a queue for async tasks
+         self.async_queue = Queue()
+
+     def _run_event_loop(self):
+         """Run the event loop in a separate thread"""
+         asyncio.set_event_loop(self.loop)
+         self.loop.run_forever()
+
+     def _run_coroutine(self, coro):
+         """Run a coroutine in the event loop and return its result"""
+         future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+         return future.result()

      @property
      def api_key(self):
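For reference, a self-contained sketch of the threading pattern these additions rely on: one background thread owns a long-lived event loop, and synchronous worker code submits coroutines to it with asyncio.run_coroutine_threadsafe. This is generic asyncio, not datamule-specific code:

    import asyncio
    from threading import Thread

    loop = asyncio.new_event_loop()

    def run_loop():
        asyncio.set_event_loop(loop)
        loop.run_forever()  # serves coroutines submitted from other threads

    Thread(target=run_loop, daemon=True).start()

    async def work(x):
        await asyncio.sleep(0.1)
        return x * 2

    # Blocks the calling thread until the coroutine completes on the loop thread.
    future = asyncio.run_coroutine_threadsafe(work(21), loop)
    print(future.result())  # 42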
@@ -55,7 +73,7 @@ class Downloader:
              print(f"Failed to log error to {error_file}: {str(e)}")

      class FileProcessor:
-         def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+         def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=None):
              self.processing_queue = Queue(maxsize=queue_size)
              self.should_stop = False
              self.processing_workers = []
@@ -64,6 +82,7 @@ class Downloader:
              self.batch_size = 50
              self.pbar = pbar
              self.downloader = downloader
+             self.keep_document_types = keep_document_types

          def start_processing_workers(self):
              for _ in range(self.max_workers):
@@ -75,7 +94,9 @@ class Downloader:
          def _process_file(self, item):
              filename, content = item
              try:
-                 parse_sgml_submission(output_dir=self.output_dir, content=content)
+                 submission = Submission(sgml_content=content, keep_document_types=self.keep_document_types)
+                 # Use the shared event loop to run save_async
+                 self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
                  self.pbar.update(1)
              except Exception as e:
                  accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
@@ -189,11 +210,11 @@ class Downloader:
              except Exception as e:
                  self._log_error(output_dir, filename, str(e))

-     async def process_batch(self, urls, output_dir):
+     async def process_batch(self, urls, output_dir, keep_document_types=None):
          os.makedirs(output_dir, exist_ok=True)

          with tqdm(total=len(urls), desc="Processing files") as pbar:
-             processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+             processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
              processor.start_processing_workers()

              semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
@@ -216,7 +237,7 @@ class Downloader:
              processor.stop_workers()
              decompression_pool.shutdown()

-     def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None):
+     def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
          """
          Query SEC filings and download/process them.

@@ -225,6 +246,8 @@ class Downloader:
          - cik: Company CIK number(s), string, int, or list
          - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
          - output_dir: Directory to save downloaded files
+         - accession_numbers: List of specific accession numbers to download
+         - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
          """
          if self.api_key is None:
              raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
@@ -262,15 +285,32 @@ class Downloader:
          start_time = time.time()

          # Process the batch asynchronously
-         asyncio.run(self.process_batch(urls, output_dir))
+         asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))

          # Calculate and display performance metrics
          elapsed_time = time.time() - start_time
          print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
          print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+     def __del__(self):
+         """Cleanup when the downloader is garbage collected"""
+         if hasattr(self, 'loop') and self.loop.is_running():
+             self.loop.call_soon_threadsafe(self.loop.stop)


- def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None):
+ def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
+     """
+     Query SEC filings and download/process them.
+
+     Parameters:
+     - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
+     - cik: Company CIK number(s), string, int, or list
+     - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
+     - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+     - output_dir: Directory to save downloaded files
+     - accession_numbers: List of specific accession numbers to download
+     - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+     """
      if accession_numbers:
          accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
      # check if acc no is empty list
@@ -282,5 +322,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None):
          cik=cik,
          filing_date=filing_date,
          output_dir=output_dir,
-         accession_numbers=accession_numbers
+         accession_numbers=accession_numbers,
+         keep_document_types=keep_document_types
      )
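Finally, a hedged sketch of this module-level entry point with the new filter (values are illustrative; per the docstring above, the API key may instead come from the DATAMULE_API_KEY environment variable):

    download(submission_type="10-K",
             filing_date=("2024-01-01", "2024-03-31"),
             api_key="your-api-key",  # hypothetical key
             keep_document_types=["10-K", "EX-10.1"])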