datamule 1.1.8__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
datamule/seclibrary/bq.py CHANGED
@@ -3,9 +3,6 @@ import requests
 import json
 
 def get_information_table(
-    # Required parameters
-    table_type="INFORMATION_TABLE",
-
     # Optional filtering parameters
     columns=None,
     name_of_issuer=None,
@@ -37,8 +34,6 @@ def get_information_table(
 
     Parameters:
     -----------
-    table_type : str
-        The table to query (default is "INFORMATION_TABLE")
     columns : List[str], optional
         Specific columns to return. If None, all columns are returned.
 
@@ -76,7 +71,7 @@ def get_information_table(
         raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key parameter")
 
     # 2. Build query parameters
-    params = {'table_type': table_type}
+    params = {'table_type': 'INFORMATION_TABLE'}
 
     # Add columns parameter if provided
     if columns:
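With table_type removed from the signature and pinned to INFORMATION_TABLE inside the function, a 13F information-table query in 1.2.1 reduces to the optional filters. A minimal sketch of a call, importing straight from the module shown in this diff and assuming DATAMULE_API_KEY is set; only columns and name_of_issuer are visible in these hunks, and the column names below are placeholders rather than a confirmed schema:

    from datamule.seclibrary.bq import get_information_table

    # Hypothetical call: filter and column names are illustrative.
    rows = get_information_table(
        columns=['nameOfIssuer', 'cusip', 'value'],
        name_of_issuer='APPLE INC',
    )
    print(len(rows), "rows returned")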
@@ -138,7 +133,198 @@ def get_information_table(
                 # Exact match
                 params[api_param_name] = value
 
-    # 3. Make the API request
+    # Call common function to make API request
+    return _make_api_request(params, api_key, print_cost, verbose)
+
+def get_345(
+    # Optional filtering parameters
+    columns=None,
+    is_derivative=None,
+    is_non_derivative=None,
+    security_title=None,
+    transaction_date=None,
+    document_type=None,
+    transaction_code=None,
+    equity_swap_involved=None,
+    transaction_timeliness=None,
+    transaction_shares=None,
+    transaction_price_per_share=None,
+    shares_owned_following_transaction=None,
+    ownership_type=None,
+    deemed_execution_date=None,
+    conversion_or_exercise_price=None,
+    exercise_date=None,
+    expiration_date=None,
+    underlying_security_title=None,
+    underlying_security_shares=None,
+    underlying_security_value=None,
+    accession=None,
+    reporting_owner_cik=None,
+    issuer_cik=None,
+    filing_date=None,
+
+    # API key handling
+    api_key=None,
+
+    # Additional options
+    print_cost=True,
+    verbose=False
+):
+    """
+    Query the SEC BigQuery API for Form 345 insider transaction data.
+
+    Parameters:
+    -----------
+    columns : List[str], optional
+        Specific columns to return. If None, all columns are returned.
+
+    # Filter parameters
+    is_derivative, security_title, etc. : Various filters that can be:
+        - str/bool: Exact match
+        - List[str]: Match any in list
+        - tuple: (min, max) range for numeric/date fields
+
+    reporting_owner_cik : str or List[str]
+        CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
+        Any match within the array will return the record.
+
+    issuer_cik : str or List[str]
+        CIK(s) of the company/companies
+
+    api_key : str, optional
+        SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+    print_cost : bool
+        Whether to print the query cost information
+    verbose : bool
+        Whether to print additional information about the query
+
+    Returns:
+    --------
+    List[Dict]
+        A list of dictionaries containing the query results
+
+    Raises:
+    -------
+    ValueError
+        If API key is missing or invalid
+    Exception
+        For API errors or other issues
+    """
+
+    # 1. Handle API key
+    if api_key is None:
+        api_key = os.getenv('DATAMULE_API_KEY')
+
+    if not api_key:
+        raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key parameter")
+
+    # 2. Build query parameters
+    params = {'table_type': 'FORM_345_TABLE'}
+
+    # Add columns parameter if provided
+    if columns:
+        if isinstance(columns, list):
+            params['columns'] = ','.join(columns)
+        else:
+            params['columns'] = columns
+
+    # Map Python parameter names to API parameter names
+    param_mapping = {
+        'is_derivative': 'isDerivative',
+        'is_non_derivative': 'isNonDerivative',
+        'security_title': 'securityTitle',
+        'transaction_date': 'transactionDate',
+        'document_type': 'documentType',
+        'transaction_code': 'transactionCode',
+        'equity_swap_involved': 'equitySwapInvolved',
+        'transaction_timeliness': 'transactionTimeliness',
+        'transaction_shares': 'transactionShares',
+        'transaction_price_per_share': 'transactionPricePerShare',
+        'shares_owned_following_transaction': 'sharesOwnedFollowingTransaction',
+        'ownership_type': 'ownershipType',
+        'deemed_execution_date': 'deemedExecutionDate',
+        'conversion_or_exercise_price': 'conversionOrExercisePrice',
+        'exercise_date': 'exerciseDate',
+        'expiration_date': 'expirationDate',
+        'underlying_security_title': 'underlyingSecurityTitle',
+        'underlying_security_shares': 'underlyingSecurityShares',
+        'underlying_security_value': 'underlyingSecurityValue',
+        'accession': 'accession',
+        'reporting_owner_cik': 'reportingOwnerCIK',
+        'issuer_cik': 'issuerCIK',
+        'filing_date': 'filingDate'
+    }
+
+    # Process all possible filter parameters
+    date_params = ['transaction_date', 'filing_date', 'deemed_execution_date', 'exercise_date', 'expiration_date']
+    boolean_params = ['is_derivative', 'is_non_derivative']
+
+    for param_name, api_param_name in param_mapping.items():
+        value = locals()[param_name]
+        if value is not None:
+            # Handle different filter types
+            if isinstance(value, list):
+                # List filter
+                params[api_param_name] = f"[{','.join(str(v) for v in value)}]"
+            elif isinstance(value, tuple):
+                # Range filter
+                if len(value) == 2:
+                    min_val, max_val = value
+                    # Handle date range specially
+                    if param_name in date_params:
+                        # Dates need to be in quotes within the parentheses
+                        if min_val is None:
+                            min_val = ''
+                        else:
+                            min_val = f"'{min_val}'"
+
+                        if max_val is None:
+                            max_val = ''
+                        else:
+                            max_val = f"'{max_val}'"
+
+                    range_str = f"({min_val},{max_val})"
+                    params[api_param_name] = range_str
+                else:
+                    raise ValueError(f"Range filter for {param_name} must be a tuple of (min, max)")
+            elif param_name in boolean_params:
+                # Boolean values
+                params[api_param_name] = str(value).lower()
+            else:
+                # Exact match
+                params[api_param_name] = value
+
+    # Call common function to make API request
+    return _make_api_request(params, api_key, print_cost, verbose)
+
+def _make_api_request(params, api_key, print_cost=True, verbose=False):
+    """
+    Common function to make API requests to the SEC BigQuery API.
+
+    Parameters:
+    -----------
+    params : dict
+        Query parameters
+    api_key : str
+        API key for authentication
+    print_cost : bool
+        Whether to print cost information
+    verbose : bool
+        Whether to print debugging information
+
+    Returns:
+    --------
+    List[Dict]
+        Data returned from the API
+
+    Raises:
+    -------
+    ValueError
+        If API key is invalid
+    Exception
+        For other API errors
+    """
+    # Make the API request
     BASE_URL = "https://sec-bq.jgfriedman99.workers.dev/"
 
     headers = {
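The filter loop added above serializes Python values into the string conventions the API expects: lists become "[a,b]", two-element tuples become "(min,max)" ranges with date bounds quoted, and the two boolean flags are lowercased. A standalone sketch of the same rules, not the library code itself; the filter values are made up, and the real function first maps the snake_case argument names to the camelCase API parameters shown in param_mapping:

    # Mirror of the serialization rules in this hunk.
    filters = {
        'transactionCode': ['P', 'S'],                    # list -> "[P,S]"
        'transactionDate': ('2024-01-01', '2024-03-31'),  # date tuple -> "('2024-01-01','2024-03-31')"
        'isDerivative': True,                             # boolean -> "true"
        'issuerCIK': '320193',                            # scalar -> exact match
    }

    params = {'table_type': 'FORM_345_TABLE'}
    for name, value in filters.items():
        if isinstance(value, bool):
            params[name] = str(value).lower()
        elif isinstance(value, list):
            params[name] = f"[{','.join(str(v) for v in value)}]"
        elif isinstance(value, tuple):
            # Date-style range shown here; numeric ranges are left unquoted in the real code.
            lo, hi = (f"'{v}'" if v is not None else '' for v in value)
            params[name] = f"({lo},{hi})"
        else:
            params[name] = value

    print(params)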
@@ -166,7 +352,15 @@ def get_information_table(
         # Extract metadata for cost reporting
         metadata = result.get('metadata', {})
 
-        # 5. Print cost information if requested
+        # Process the data to handle array fields
+        data = result.get('data', [])
+        for row in data:
+            # Check if reportingOwnerCIK is an array that needs processing
+            if 'reportingOwnerCIK' in row and isinstance(row['reportingOwnerCIK'], list):
+                # Transform from [{'v': 'value1'}, {'v': 'value2'}] to comma-separated string
+                row['reportingOwnerCIK'] = ','.join([item['v'] for item in row['reportingOwnerCIK'] if 'v' in item])
+
+        # Print cost information if requested
         if print_cost and 'billing' in metadata:
             billing = metadata['billing']
             query_info = metadata.get('query_info', {})
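The new post-processing step flattens BigQuery's repeated-field encoding for reportingOwnerCIK before the rows are returned. The transformation amounts to the following (CIK values are placeholders):

    row = {'reportingOwnerCIK': [{'v': '0001234567'}, {'v': '0007654321'}]}
    row['reportingOwnerCIK'] = ','.join([item['v'] for item in row['reportingOwnerCIK'] if 'v' in item])
    print(row['reportingOwnerCIK'])  # -> 0001234567,0007654321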
@@ -181,11 +375,154 @@ def get_information_table(
             print(f"Cache Hit: {query_info.get('cache_hit', False)}")
             print("==============================\n")
 
-        # 6. Return data
-        return result.get('data', [])
+        # Return data
+        return data
 
     except requests.exceptions.RequestException as e:
-        if response.status_code == 401:
+        if hasattr(e, 'response') and e.response is not None and e.response.status_code == 401:
             raise ValueError("Authentication failed: Invalid API key")
         else:
-            raise Exception(f"Request failed: {str(e)}")
+            raise Exception(f"Request failed: {str(e)}")
+
+def get_proxy_voting_record(
+    # Optional filtering parameters
+    columns=None,
+    meeting_date=None,
+    isin=None,
+    cusip=None,
+    issuer_name=None,
+    vote_description=None,
+    shares_on_loan=None,
+    shares_voted=None,
+    vote_category=None,
+    vote_record=None,
+    vote_source=None,
+    how_voted=None,
+    figi=None,
+    management_recommendation=None,
+    accession=None,
+    reporting_owner_cik=None,
+    filing_date=None,
+
+    # API key handling
+    api_key=None,
+
+    # Additional options
+    print_cost=True,
+    verbose=False
+):
+    """
+    Query the SEC BigQuery API for NPX proxy voting record data.
+
+    Parameters:
+    -----------
+    columns : List[str], optional
+        Specific columns to return. If None, all columns are returned.
+
+    # Filter parameters
+    meeting_date, isin, cusip, etc. : Various filters that can be:
+        - str: Exact match
+        - List[str]: Match any in list
+        - tuple: (min, max) range for numeric/date fields
+
+    shares_on_loan, shares_voted : int/float or tuple
+        Numeric values or (min, max) range
+
+    filing_date : str or tuple
+        Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
+
+    api_key : str, optional
+        SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+    print_cost : bool
+        Whether to print the query cost information
+    verbose : bool
+        Whether to print additional information about the query
+
+    Returns:
+    --------
+    List[Dict]
+        A list of dictionaries containing the query results
+
+    Raises:
+    -------
+    ValueError
+        If API key is missing or invalid
+    Exception
+        For API errors or other issues
+    """
+
+    # 1. Handle API key
+    if api_key is None:
+        api_key = os.getenv('DATAMULE_API_KEY')
+
+    if not api_key:
+        raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key parameter")
+
+    # 2. Build query parameters
+    params = {'table_type': 'NPX_VOTING_TABLE'}
+
+    # Add columns parameter if provided
+    if columns:
+        if isinstance(columns, list):
+            params['columns'] = ','.join(columns)
+        else:
+            params['columns'] = columns
+
+    # Map Python parameter names to API parameter names
+    param_mapping = {
+        'meeting_date': 'meetingDate',
+        'isin': 'isin',
+        'cusip': 'cusip',
+        'issuer_name': 'issuerName',
+        'vote_description': 'voteDescription',
+        'shares_on_loan': 'sharesOnLoan',
+        'shares_voted': 'sharesVoted',
+        'vote_category': 'voteCategory',
+        'vote_record': 'voteRecord',
+        'vote_source': 'voteSource',
+        'how_voted': 'howVoted',
+        'figi': 'figi',
+        'management_recommendation': 'managementRecommendation',
+        'accession': 'accession',
+        'reporting_owner_cik': 'reportingOwnerCIK',
+        'filing_date': 'filingDate'
+    }
+
+    # Process all possible filter parameters
+    date_params = ['meeting_date', 'filing_date']
+    numeric_params = ['shares_on_loan', 'shares_voted']
+
+    for param_name, api_param_name in param_mapping.items():
+        value = locals()[param_name]
+        if value is not None:
+            # Handle different filter types
+            if isinstance(value, list):
+                # List filter
+                params[api_param_name] = f"[{','.join(str(v) for v in value)}]"
+            elif isinstance(value, tuple):
+                # Range filter
+                if len(value) == 2:
+                    min_val, max_val = value
+                    # Handle date range specially
+                    if param_name in date_params:
+                        # Dates need to be in quotes within the parentheses
+                        if min_val is None:
+                            min_val = ''
+                        else:
+                            min_val = f"'{min_val}'"
+
+                        if max_val is None:
+                            max_val = ''
+                        else:
+                            max_val = f"'{max_val}'"
+
+                    range_str = f"({min_val},{max_val})"
+                    params[api_param_name] = range_str
+                else:
+                    raise ValueError(f"Range filter for {param_name} must be a tuple of (min, max)")
+            else:
+                # Exact match
+                params[api_param_name] = value
+
+    # Call common function to make API request
+    return _make_api_request(params, api_key, print_cost, verbose)
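Putting the two new query functions together, a caller might use them roughly as follows. This is a sketch based on the signatures above: the CIK, CUSIP, and dates are placeholders, and both calls assume DATAMULE_API_KEY is set.

    from datamule.seclibrary.bq import get_345, get_proxy_voting_record

    # Form 3/4/5 insider transactions for one issuer over a quarter.
    insider_rows = get_345(
        issuer_cik='320193',
        transaction_date=('2024-01-01', '2024-03-31'),
    )

    # N-PX proxy voting records for one security, with an open-ended date range
    # (the tuple handling above turns a None bound into an empty endpoint).
    votes = get_proxy_voting_record(
        cusip='037833100',
        filing_date=('2024-07-01', None),
    )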
@@ -16,17 +16,35 @@ from threading import Thread
 from secsgml import parse_sgml_submission
 from .query import query
 from os import cpu_count
+from ..submission import Submission
 
 class Downloader:
     def __init__(self, api_key=None):
         self.BASE_URL = "https://library.datamule.xyz/original/nc/"
         self.CHUNK_SIZE = 2 * 1024 * 1024
-        self.MAX_CONCURRENT_DOWNLOADS = 250
+        self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
         self.MAX_PROCESSING_WORKERS = cpu_count()
         self.QUEUE_SIZE = 10
         if api_key is not None:
             self._api_key = api_key
+        # Create a shared event loop for async operations
+        self.loop = asyncio.new_event_loop()
+        # Create a thread to run the event loop
+        self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
+        self.loop_thread.start()
+        # Create a queue for async tasks
+        self.async_queue = Queue()
+
+    def _run_event_loop(self):
+        """Run the event loop in a separate thread"""
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    def _run_coroutine(self, coro):
+        """Run a coroutine in the event loop and return its result"""
+        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+        return future.result()
 
     @property
     def api_key(self):
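The constructor now starts a long-lived event loop on a daemon thread, and _run_coroutine lets the synchronous processing workers block on coroutines submitted to that loop. A minimal standalone sketch of the same pattern, outside the Downloader class:

    import asyncio
    from threading import Thread

    loop = asyncio.new_event_loop()

    def run_loop():
        asyncio.set_event_loop(loop)
        loop.run_forever()

    Thread(target=run_loop, daemon=True).start()

    async def work(x):
        await asyncio.sleep(0.1)
        return x * 2

    # Submit a coroutine from synchronous code and block on its result,
    # as Downloader._run_coroutine does with submission.save_async().
    future = asyncio.run_coroutine_threadsafe(work(21), loop)
    print(future.result())  # 42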
@@ -55,7 +73,7 @@ class Downloader:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
     class FileProcessor:
-        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=None):
             self.processing_queue = Queue(maxsize=queue_size)
             self.should_stop = False
             self.processing_workers = []
@@ -64,6 +82,7 @@ class Downloader:
             self.batch_size = 50
             self.pbar = pbar
             self.downloader = downloader
+            self.keep_document_types = keep_document_types
 
         def start_processing_workers(self):
             for _ in range(self.max_workers):
@@ -75,7 +94,9 @@ class Downloader:
         def _process_file(self, item):
             filename, content = item
             try:
-                parse_sgml_submission(output_dir=self.output_dir, content=content)
+                submission = Submission(sgml_content=content, keep_document_types=self.keep_document_types)
+                # Use the shared event loop to run save_async
+                self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
                 self.pbar.update(1)
             except Exception as e:
                 accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
@@ -189,11 +210,11 @@ class Downloader:
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir):
+    async def process_batch(self, urls, output_dir, keep_document_types=None):
         os.makedirs(output_dir, exist_ok=True)
 
         with tqdm(total=len(urls), desc="Processing files") as pbar:
-            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
             processor.start_processing_workers()
 
             semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
@@ -216,7 +237,7 @@ class Downloader:
             processor.stop_workers()
             decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
         """
         Query SEC filings and download/process them.
 
@@ -225,6 +246,8 @@ class Downloader:
         - cik: Company CIK number(s), string, int, or list
         - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
         - output_dir: Directory to save downloaded files
+        - accession_numbers: List of specific accession numbers to download
+        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
         """
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
@@ -262,15 +285,32 @@ class Downloader:
         start_time = time.time()
 
         # Process the batch asynchronously
-        asyncio.run(self.process_batch(urls, output_dir))
+        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))
 
         # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+    def __del__(self):
+        """Cleanup when the downloader is garbage collected"""
+        if hasattr(self, 'loop') and self.loop.is_running():
+            self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None):
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
+    """
+    Query SEC filings and download/process them.
+
+    Parameters:
+    - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
+    - cik: Company CIK number(s), string, int, or list
+    - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
+    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+    - output_dir: Directory to save downloaded files
+    - accession_numbers: List of specific accession numbers to download
+    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+    """
     if accession_numbers:
         accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
     # check if acc no is empty list
@@ -282,5 +322,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         cik=cik,
         filing_date=filing_date,
         output_dir=output_dir,
-        accession_numbers=accession_numbers
+        accession_numbers=accession_numbers,
+        keep_document_types=keep_document_types
     )
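Taken together, the new keep_document_types option flows from the module-level helper through Downloader.download into each FileProcessor, so unwanted documents in a submission are dropped at save time. A sketch of a filtered bulk download; the CIK, dates, and document types are placeholders, and the import path is an assumption since the diff does not name this second file:

    # Module path assumed; the diff does not show this file's name.
    from datamule.seclibrary.downloader import download

    download(
        submission_type='10-K',
        cik=320193,
        filing_date=('2023-01-01', '2024-12-31'),
        output_dir='downloads',
        keep_document_types=['10-K', 'EX-10.1'],  # other documents in each submission are dropped
    )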