datamule 1.1.8__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +31 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/bq.py +349 -12
- datamule/seclibrary/downloader.py +50 -9
- datamule/sheet.py +458 -34
- datamule/submission.py +102 -7
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/METADATA +1 -1
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/RECORD +16 -12
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/WHEEL +1 -1
- datamule/document.py +0 -472
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/top_level.txt +0 -0
datamule/sheet.py
CHANGED
```diff
@@ -3,7 +3,7 @@ import csv
 import os
 from .helper import _process_cik_and_metadata_filters, load_package_dataset
 from .sec.xbrl.downloadcompanyfacts import download_company_facts
-from .seclibrary.bq import get_information_table
+from .seclibrary.bq import get_information_table, get_345, get_proxy_voting_record
 
 class Sheet:
     def __init__(self, path):
@@ -31,9 +31,6 @@ class Sheet:
 
     def get_information_table(
         self,
-        # Required parameters
-        table_type="INFORMATION_TABLE",
-
         # Optional filtering parameters
         columns=None,
         name_of_issuer=None,
@@ -65,8 +62,6 @@ class Sheet:
 
        Parameters:
        -----------
-       table_type : str
-           The table to query (default is "INFORMATION_TABLE")
        columns : List[str], optional
            Specific columns to return. If None, all columns are returned.
 
@@ -97,7 +92,6 @@ class Sheet:
        """
 
        return get_information_table(
-           table_type=table_type,
            columns=columns,
            name_of_issuer=name_of_issuer,
            title_of_class=title_of_class,
@@ -124,12 +118,164 @@ class Sheet:
            verbose=verbose
        )
 
+    def get_345(
+        self,
+        # Optional filtering parameters
+        columns=None,
+        is_derivative=None,
+        is_non_derivative=None,
+        security_title=None,
+        transaction_date=None,
+        document_type=None,
+        transaction_code=None,
+        equity_swap_involved=None,
+        transaction_timeliness=None,
+        transaction_shares=None,
+        transaction_price_per_share=None,
+        shares_owned_following_transaction=None,
+        ownership_type=None,
+        deemed_execution_date=None,
+        conversion_or_exercise_price=None,
+        exercise_date=None,
+        expiration_date=None,
+        underlying_security_title=None,
+        underlying_security_shares=None,
+        underlying_security_value=None,
+        accession=None,
+        reporting_owner_cik=None,
+        issuer_cik=None,
+        filing_date=None,
+
+        # API key handling
+        api_key=None,
+
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for Form 345 insider transaction data.
+
+        Parameters:
+        -----------
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+
+        # Filter parameters
+        is_derivative, security_title, etc. : Various filters that can be:
+            - str/bool: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
+
+        reporting_owner_cik : str or List[str]
+            CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
+            Any match within the array will return the record.
+
+        issuer_cik : str or List[str]
+            CIK(s) of the company/companies
+
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
+
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+
+        return get_345(
+            columns=columns,
+            is_derivative=is_derivative,
+            is_non_derivative=is_non_derivative,
+            security_title=security_title,
+            transaction_date=transaction_date,
+            document_type=document_type,
+            transaction_code=transaction_code,
+            equity_swap_involved=equity_swap_involved,
+            transaction_timeliness=transaction_timeliness,
+            transaction_shares=transaction_shares,
+            transaction_price_per_share=transaction_price_per_share,
+            shares_owned_following_transaction=shares_owned_following_transaction,
+            ownership_type=ownership_type,
+            deemed_execution_date=deemed_execution_date,
+            conversion_or_exercise_price=conversion_or_exercise_price,
+            exercise_date=exercise_date,
+            expiration_date=expiration_date,
+            underlying_security_title=underlying_security_title,
+            underlying_security_shares=underlying_security_shares,
+            underlying_security_value=underlying_security_value,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            issuer_cik=issuer_cik,
+            filing_date=filing_date,
+
+            # API key handling
+            api_key=api_key,
+
+            # Additional options
+            print_cost=print_cost,
+            verbose=verbose
+        )
+
+    def _download_to_csv(self, data, filepath, verbose=False):
+        """
+        Helper method to download data to a CSV file.
+
+        Parameters:
+        -----------
+        data : List[Dict]
+            The data to save
+        filepath : str or Path
+            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
+        verbose : bool
+            Whether to print additional information
+
+        Returns:
+        --------
+        List[Dict]
+            The input data (for method chaining)
+        """
+        # If no data returned, nothing to save
+        if not data:
+            if verbose:
+                print("No data returned from API. No file was created.")
+            return data
+
+        # Resolve filepath - if it's not absolute, make it relative to self.path
+        filepath_obj = Path(filepath)
+        if not filepath_obj.is_absolute():
+            filepath_obj = self.path / filepath_obj
+
+        # Create directory if it doesn't exist
+        os.makedirs(filepath_obj.parent, exist_ok=True)
+
+        # Get fieldnames from the first record
+        fieldnames = data[0].keys()
+
+        # Write to CSV
+        with open(filepath_obj, 'w', newline='') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(data)
+
+        if verbose:
+            print(f"Saved {len(data)} records to {filepath_obj}")
+
+
     def download_information_table(
         self,
         filepath,
-        # Required parameters
-        table_type="INFORMATION_TABLE",
-
         # Optional filtering parameters
         columns=None,
         name_of_issuer=None,
@@ -164,8 +310,6 @@ class Sheet:
        filepath : str
            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
 
-       table_type : str
-           The table to query (default is "INFORMATION_TABLE")
        columns : List[str], optional
            Specific columns to return. If None, all columns are returned.
 
@@ -196,7 +340,6 @@ class Sheet:
        """
        # Get the data from the API
        data = self.get_information_table(
-           table_type=table_type,
            columns=columns,
            name_of_issuer=name_of_issuer,
            title_of_class=title_of_class,
@@ -219,30 +362,311 @@ class Sheet:
            verbose=verbose
        )
 
-        #
-
-
-
-
+        # Save to CSV using the helper method
+        return self._download_to_csv(data, filepath, verbose)
+
+    def download_345(
+        self,
+        filepath,
+        # Optional filtering parameters
+        columns=None,
+        is_derivative=None,
+        is_non_derivative=None,
+        security_title=None,
+        transaction_date=None,
+        document_type=None,
+        transaction_code=None,
+        equity_swap_involved=None,
+        transaction_timeliness=None,
+        transaction_shares=None,
+        transaction_price_per_share=None,
+        shares_owned_following_transaction=None,
+        ownership_type=None,
+        deemed_execution_date=None,
+        conversion_or_exercise_price=None,
+        exercise_date=None,
+        expiration_date=None,
+        underlying_security_title=None,
+        underlying_security_shares=None,
+        underlying_security_value=None,
+        accession=None,
+        reporting_owner_cik=None,
+        issuer_cik=None,
+        filing_date=None,
 
-        #
-
-        if not filepath_obj.is_absolute():
-            filepath_obj = self.path / filepath_obj
+        # API key handling
+        api_key=None,
 
-        #
-
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for Form 345 insider transaction data and save to CSV.
 
-
-
+        Parameters:
+        -----------
+        filepath : str
+            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
 
-
-
-
-
-
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+
+        # Filter parameters
+        is_derivative, security_title, etc. : Various filters that can be:
+            - str/bool: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
 
-
-
+        reporting_owner_cik : str or List[str]
+            CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
+            Any match within the array will return the record.
+
+        issuer_cik : str or List[str]
+            CIK(s) of the company/companies
+
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
 
-
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+        # Get the data from the API
+        data = self.get_345(
+            columns=columns,
+            is_derivative=is_derivative,
+            is_non_derivative=is_non_derivative,
+            security_title=security_title,
+            transaction_date=transaction_date,
+            document_type=document_type,
+            transaction_code=transaction_code,
+            equity_swap_involved=equity_swap_involved,
+            transaction_timeliness=transaction_timeliness,
+            transaction_shares=transaction_shares,
+            transaction_price_per_share=transaction_price_per_share,
+            shares_owned_following_transaction=shares_owned_following_transaction,
+            ownership_type=ownership_type,
+            deemed_execution_date=deemed_execution_date,
+            conversion_or_exercise_price=conversion_or_exercise_price,
+            exercise_date=exercise_date,
+            expiration_date=expiration_date,
+            underlying_security_title=underlying_security_title,
+            underlying_security_shares=underlying_security_shares,
+            underlying_security_value=underlying_security_value,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            issuer_cik=issuer_cik,
+            filing_date=filing_date,
+            api_key=api_key,
+            print_cost=print_cost,
+            verbose=verbose
+        )
+
+        # Save to CSV using the helper method
+        return self._download_to_csv(data, filepath, verbose)
+
+    def get_proxy_voting_record(
+        self,
+        # Optional filtering parameters
+        columns=None,
+        meeting_date=None,
+        isin=None,
+        cusip=None,
+        issuer_name=None,
+        vote_description=None,
+        shares_on_loan=None,
+        shares_voted=None,
+        vote_category=None,
+        vote_record=None,
+        vote_source=None,
+        how_voted=None,
+        figi=None,
+        management_recommendation=None,
+        accession=None,
+        reporting_owner_cik=None,
+        filing_date=None,
+
+        # API key handling
+        api_key=None,
+
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for NPX proxy voting record data.
+
+        Parameters:
+        -----------
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+
+        # Filter parameters
+        meeting_date, isin, cusip, etc. : Various filters that can be:
+            - str: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
+
+        shares_on_loan, shares_voted : int/float or tuple
+            Numeric values or (min, max) range
+
+        filing_date : str or tuple
+            Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
+
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
+
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+
+        return get_proxy_voting_record(
+            columns=columns,
+            meeting_date=meeting_date,
+            isin=isin,
+            cusip=cusip,
+            issuer_name=issuer_name,
+            vote_description=vote_description,
+            shares_on_loan=shares_on_loan,
+            shares_voted=shares_voted,
+            vote_category=vote_category,
+            vote_record=vote_record,
+            vote_source=vote_source,
+            how_voted=how_voted,
+            figi=figi,
+            management_recommendation=management_recommendation,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            filing_date=filing_date,
+
+            # API key handling
+            api_key=api_key,
+
+            # Additional options
+            print_cost=print_cost,
+            verbose=verbose
+        )
+
+    def download_proxy_voting_record(
+        self,
+        filepath,
+        # Optional filtering parameters
+        columns=None,
+        meeting_date=None,
+        isin=None,
+        cusip=None,
+        issuer_name=None,
+        vote_description=None,
+        shares_on_loan=None,
+        shares_voted=None,
+        vote_category=None,
+        vote_record=None,
+        vote_source=None,
+        how_voted=None,
+        figi=None,
+        management_recommendation=None,
+        accession=None,
+        reporting_owner_cik=None,
+        filing_date=None,
+
+        # API key handling
+        api_key=None,
+
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for NPX proxy voting record data and save to CSV.
+
+        Parameters:
+        -----------
+        filepath : str
+            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
+
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+
+        # Filter parameters
+        meeting_date, isin, cusip, etc. : Various filters that can be:
+            - str: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
+
+        shares_on_loan, shares_voted : int/float or tuple
+            Numeric values or (min, max) range
+
+        filing_date : str or tuple
+            Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
+
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
+
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+        # Get the data from the API
+        data = self.get_proxy_voting_record(
+            columns=columns,
+            meeting_date=meeting_date,
+            isin=isin,
+            cusip=cusip,
+            issuer_name=issuer_name,
+            vote_description=vote_description,
+            shares_on_loan=shares_on_loan,
+            shares_voted=shares_voted,
+            vote_category=vote_category,
+            vote_record=vote_record,
+            vote_source=vote_source,
+            how_voted=how_voted,
+            figi=figi,
+            management_recommendation=management_recommendation,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            filing_date=filing_date,
+            api_key=api_key,
+            print_cost=print_cost,
+            verbose=verbose
+        )
+
+        # Save to CSV using the helper method
+        return self._download_to_csv(data, filepath, verbose)
```
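In short, sheet.py drops the `table_type` parameter from `get_information_table`/`download_information_table` (a breaking change for callers that passed it explicitly), adds `get_345`/`download_345` and `get_proxy_voting_record`/`download_proxy_voting_record`, and factors the shared CSV-writing logic into the new `_download_to_csv` helper. A minimal usage sketch of the new surface; the top-level import, CIK, CUSIP, and date values below are illustrative placeholders, and `DATAMULE_API_KEY` is assumed to be set in the environment:

```python
# Sketch only: `from datamule import Sheet` assumes Sheet is re-exported at
# the package top level; `from datamule.sheet import Sheet` also matches the
# file layout above. All filter values are placeholders.
from datamule import Sheet

sheet = Sheet("data")  # base path; relative CSV filepaths resolve against it

# Form 3/4/5 insider transactions. Per the docstrings above, each filter takes
# an exact value, a list of values, or a (min, max) tuple for numeric/date fields.
rows = sheet.get_345(
    issuer_cik="320193",                       # placeholder CIK
    filing_date=("2024-01-01", "2024-12-31"),  # (start, end) range
)

# Same query routed through the new _download_to_csv helper.
sheet.download_345("insider_345.csv", issuer_cik="320193")

# N-PX proxy voting records follow the same pattern.
votes = sheet.get_proxy_voting_record(
    cusip="037833100",                         # placeholder CUSIP
    meeting_date=("2024-01-01", "2024-06-30"),
)
sheet.download_proxy_voting_record("votes.csv", cusip="037833100")
```

Callers upgrading from 1.1.8 that passed `table_type=` to `get_information_table` or `download_information_table` will need to drop that argument in 1.2.1.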
datamule/submission.py
CHANGED
```diff
@@ -1,8 +1,10 @@
 from pathlib import Path
 import json
-from .document import Document
+from .document.document import Document
 from secsgml import parse_sgml_submission_into_memory
-
+import os
+import aiofiles
+
 
 class Submission:
     def __init__(self, path=None,sgml_content=None,keep_document_types=None):
@@ -14,7 +16,12 @@ class Submission:
         if sgml_content is not None:
             self.path = None
             self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+
+            self.accession = self.metadata['accession-number']
+            self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
+
             self.documents = []
+            filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata['documents']):
                 type = doc.get('type')
@@ -24,15 +31,19 @@ class Submission:
                     continue
                 filename = doc.get('filename')
                 extension = Path(filename).suffix
-                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension))
+                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
 
+                filtered_metadata_documents.append(doc)
+
+            self.metadata['documents'] = filtered_metadata_documents
 
         if path is not None:
             self.path = Path(path)
             metadata_path = self.path / 'metadata.json'
             with metadata_path.open('r') as f:
                 self.metadata = json.load(f)
-
+
+
 
     def document_type(self, document_type):
         # Convert single document type to list for consistent handling
@@ -57,7 +68,7 @@ class Submission:
             with document_path.open('r') as f:
                 content = f.read()
 
-            yield Document(type=doc['type'], content=content, extension=extension)
+            yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
         # if loaded from sgml_content
         else:
             yield self.documents[idx]
@@ -81,7 +92,7 @@ class Submission:
                 with document_path.open('r') as f:
                     content = f.read()
 
-                yield Document(type=doc['type'], content=content, extension=extension)
+                yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
             else:
                 print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
 
@@ -110,4 +121,88 @@ class Submission:
             # delete the file
             document_path.unlink()
         else:
-            print("Warning: keep() method is only available when loading from path.")
+            print("Warning: keep() method is only available when loading from path.")
+
+
+
+    def save(self, output_dir="filings"):
+        file_dir = Path(output_dir) / str(self.accession)
+        file_dir.mkdir(parents=True, exist_ok=True)
+
+        metadata_path = file_dir / "metadata.json"
+        with open(metadata_path, 'w') as f:
+            json.dump(self.metadata, f, indent=4)
+
+        for idx, doc in enumerate(self.metadata['documents']):
+            try:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = f"{doc.get('sequence', idx)}.txt"
+            except (KeyError, IndexError):
+                filename = f"{idx}.txt"
+
+            doc_path = file_dir / filename
+
+            if self.path is not None:
+                if hasattr(self, 'documents') and self.documents:
+                    content = self.documents[idx].content
+                else:
+                    orig_doc_path = self.path / filename
+                    if orig_doc_path.exists():
+                        with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
+                            content = f.read()
+                    else:
+                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
+                        continue
+            else:
+                content = self.documents[idx].content
+
+            if isinstance(content, bytes):
+                with open(doc_path, 'wb') as f:
+                    f.write(content)
+            else:
+                with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
+                    f.write(content)
+
+        return file_dir
+
+    async def save_async(self, output_dir="filings"):
+        file_dir = Path(output_dir) / str(self.accession)
+        os.makedirs(file_dir, exist_ok=True)
+
+        metadata_path = file_dir / "metadata.json"
+        async with aiofiles.open(metadata_path, 'w') as f:
+            await f.write(json.dumps(self.metadata, indent=4))
+
+        for idx, doc in enumerate(self.metadata['documents']):
+            try:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = f"{doc.get('sequence', idx)}.txt"
+            except (KeyError, IndexError):
+                filename = f"{idx}.txt"
+
+            doc_path = file_dir / filename
+
+            if self.path is not None:
+                if hasattr(self, 'documents') and self.documents:
+                    content = self.documents[idx].content
+                else:
+                    orig_doc_path = self.path / filename
+                    if orig_doc_path.exists():
+                        async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
+                            content = await f.read()
+                    else:
+                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
+                        continue
+            else:
+                content = self.documents[idx].content
+
+            if isinstance(content, bytes):
+                async with aiofiles.open(doc_path, 'wb') as f:
+                    await f.write(content)
+            else:
+                async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
+                    await f.write(content)
+
+        return file_dir
```
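submission.py now parses `accession` and `filing-date` out of the SGML header, threads them into every `Document` it constructs, trims `metadata['documents']` down to the kept documents, and adds `save()`/`save_async()` for writing a submission back to disk (the async variant does its I/O through aiofiles, the newly imported dependency). A minimal round-trip sketch, assuming `sgml_text` holds a raw EDGAR SGML submission string fetched beforehand:

```python
# Sketch only: `sgml_text` is assumed to be a raw SGML submission string
# (e.g. downloaded from EDGAR); the output directories are placeholders.
import asyncio
from datamule.submission import Submission

sub = Submission(sgml_content=sgml_text)
print(sub.accession, sub.filing_date)  # parsed from the SGML header in 1.2.1

# Synchronous write: filings/<accession>/metadata.json plus each kept document.
out_dir = sub.save(output_dir="filings")
print(f"saved to {out_dir}")

# Async variant mirrors save() but reads and writes through aiofiles.
asyncio.run(sub.save_async(output_dir="filings_async"))
```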