datamule 1.2.5__tar.gz → 1.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-1.2.5 → datamule-1.2.6}/PKG-INFO +1 -2
- {datamule-1.2.5 → datamule-1.2.6}/datamule/__init__.py +1 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/document.py +10 -6
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/information_table.py +1 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/ownership.py +1 -1
- datamule-1.2.6/datamule/document/mappings/proxy_voting_record.py +17 -0
- datamule-1.2.6/datamule/document/mappings/submission_metadata.py +9 -0
- datamule-1.2.6/datamule/document/mappings/thirteenfhr.py +72 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/twentyfivense.py +1 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/processing.py +35 -5
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/table.py +6 -1
- {datamule-1.2.5 → datamule-1.2.6}/datamule/helper.py +10 -1
- {datamule-1.2.5 → datamule-1.2.6}/datamule/index.py +8 -10
- {datamule-1.2.5 → datamule-1.2.6}/datamule/portfolio.py +16 -11
- datamule-1.2.6/datamule/sec/submissions/monitor.py +183 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/submissions/textsearch.py +0 -4
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/xbrl/streamcompanyfacts.py +1 -1
- {datamule-1.2.5 → datamule-1.2.6}/datamule/seclibrary/downloader.py +2 -2
- {datamule-1.2.5 → datamule-1.2.6}/datamule/submission.py +80 -14
- {datamule-1.2.5 → datamule-1.2.6}/datamule.egg-info/PKG-INFO +1 -2
- {datamule-1.2.5 → datamule-1.2.6}/datamule.egg-info/SOURCES.txt +1 -2
- {datamule-1.2.5 → datamule-1.2.6}/datamule.egg-info/requires.txt +0 -1
- {datamule-1.2.5 → datamule-1.2.6}/setup.py +1 -2
- datamule-1.2.5/datamule/document/mappings/proxy_voting_record.py +0 -1
- datamule-1.2.5/datamule/document/mappings/thirteenfhr.py +0 -5
- datamule-1.2.5/datamule/sec/rss/monitor.py +0 -416
- datamule-1.2.5/datamule/sec/submissions/monitor.py +0 -130
- datamule-1.2.5/datamule/seclibrary/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/config.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/atsn.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/cfportal.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/ex99a_sdr.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/ex99c_sdr.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/ex99g_sdr.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/ex99i_sdr.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/nmfp.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/npx.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/onefourtyfour.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/sbs.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/sbsef.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/schedule13.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/sdr.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/ta.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/document/mappings/twentyfourf2nt.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/package_updater.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-1.2.5/datamule/sec/rss → datamule-1.2.6/datamule/sec/submissions}/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/utils.py +0 -0
- {datamule-1.2.5/datamule/sec/submissions → datamule-1.2.6/datamule/sec/xbrl}/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-1.2.5/datamule/sec/xbrl → datamule-1.2.6/datamule/seclibrary}/__init__.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/seclibrary/bq.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/seclibrary/query.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule/sheet.py +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.2.5 → datamule-1.2.6}/setup.cfg +0 -0

{datamule-1.2.5 → datamule-1.2.6}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.2.5
+Version: 1.2.6
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

@@ -10,7 +10,6 @@ Requires-Dist: tqdm
 Requires-Dist: requests
 Requires-Dist: nest_asyncio
 Requires-Dist: aiofiles
-Requires-Dist: polars
 Requires-Dist: setuptools
 Requires-Dist: selectolax
 Requires-Dist: pytz

{datamule-1.2.5 → datamule-1.2.6}/datamule/document/document.py
@@ -118,10 +118,11 @@ class Document:
         # will deprecate this when we add html2dict
         elif self.extension in ['.htm', '.html','.txt']:
 
-
-
-            elif self.type == '10-Q':
+
+            if self.type == '10-Q':
                 mapping_dict = dict_10q
+            elif self.type == '10-K':
+                mapping_dict = dict_10k
             elif self.type == '8-K':
                 mapping_dict = dict_8k
             elif self.type == 'SC 13D':

@@ -141,10 +142,13 @@ class Document:
             json.dump(self.data, f, indent=2)
 
     def to_tabular(self):
-        if self.extension != '.xml':
+        if self.type == 'submission_metadata':
+            return process_tabular_data(self)
+        elif self.extension != '.xml':
             return []
-
-
+        else:
+            self.parse()
+            return process_tabular_data(self)
 
 
     def write_csv(self, output_folder):
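
Taken together, the two document.py hunks add a 10-K mapping branch and reroute to_tabular() so metadata documents are tabulated without an XML parse. A self-contained sketch of the new dispatch; the Document stub and process_tabular_data stand-in below are illustrative, not the package's real classes:

# Illustrative stand-ins for the real Document / process_tabular_data above.
class Document:
    def __init__(self, type, extension, data=None):
        self.type, self.extension, self.data = type, extension, data

    def parse(self):
        # The real implementation parses XML; here we just ensure data exists.
        self.data = self.data or {}

    def to_tabular(self):
        if self.type == 'submission_metadata':
            return process_tabular_data(self)   # tabulate without an XML parse
        elif self.extension != '.xml':
            return []                           # only XML documents are tabulated
        else:
            self.parse()                        # parse first, then tabulate
            return process_tabular_data(self)

def process_tabular_data(doc):
    return [doc.data or {}]

print(Document('submission_metadata', '.txt').to_tabular())  # -> [{}]
print(Document('10-K', '.htm').to_tabular())                 # -> []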

datamule-1.2.6/datamule/document/mappings/proxy_voting_record.py (new file)
@@ -0,0 +1,17 @@
+proxy_voting_record_dict = {
+    'meetingDate': 'meetingDate',
+    'accession': 'accessionNumber',
+    'vote_voteRecord_managementRecommendation': 'managementRecommendation',
+    'sharesVoted': 'sharesVoted', # Top-level sharesVoted
+    'vote_voteRecord_howVoted': 'howVoted',
+    'sharesOnLoan': 'sharesOnLoan',
+    'cusip': 'cusip',
+    'issuerName': 'issuerName',
+    'voteCategories_voteCategory_categoryType': 'categoryType',
+    'voteDescription': 'voteDescription',
+    'voteManager_otherManagers_otherManager': 'otherManager',
+    'vote_voteRecord_sharesVoted': 'recordSharesVoted', # To distinguish from top-level sharesVoted
+    'isin': 'isin',
+    'voteSource': 'voteSource',
+    'voteSeries': 'voteSeries'
+}
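
These mapping modules all follow one pattern: flattened XML paths as keys, output column names as values. A minimal sketch of how such a dict renames flattened keys; apply_mapping is a hypothetical helper written for illustration, not the package's actual code:

# Hypothetical helper: rename flattened keys using a mapping dict like the one above.
def apply_mapping(row, mapping):
    return {out_col: row[src] for src, out_col in mapping.items() if src in row}

mapping = {
    'vote_voteRecord_howVoted': 'howVoted',
    'vote_voteRecord_sharesVoted': 'recordSharesVoted',  # kept distinct from top-level sharesVoted
    'sharesVoted': 'sharesVoted',
}
row = {'vote_voteRecord_howVoted': 'FOR', 'vote_voteRecord_sharesVoted': 100, 'sharesVoted': 120}
print(apply_mapping(row, mapping))
# {'howVoted': 'FOR', 'recordSharesVoted': 100, 'sharesVoted': 120}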

datamule-1.2.6/datamule/document/mappings/submission_metadata.py (new file)
@@ -0,0 +1,9 @@
+# Note: submission_metadata is my designation, not SEC for the header of the Submission tag
+
+document_submission_metadata_dict = {
+    'accession':'accession',
+    'type':'type',
+    'sequence' : 'sequence',
+    'filename' : 'filename',
+    'description':'description'
+}

datamule-1.2.6/datamule/document/mappings/thirteenfhr.py (new file)
@@ -0,0 +1,72 @@
+# Ready for mass testing
+
+# 13F-HR (Institutional Investment Manager Holdings) mapping
+thirteenfhr_dict = {
+    # Cover Page Mapping
+    'formData_coverPage_reportCalendarOrQuarter': 'reportCalendarOrQuarter',
+    'formData_coverPage_filingManager_name': 'filingManagerName',
+    'formData_coverPage_filingManager_address_street1': 'filingManagerStreet1',
+    'formData_coverPage_filingManager_address_street2': 'filingManagerStreet2',
+    'formData_coverPage_filingManager_address_city': 'filingManagerCity',
+    'formData_coverPage_filingManager_address_stateOrCountry': 'filingManagerStateOrCountry',
+    'formData_coverPage_filingManager_address_zipCode': 'filingManagerZipCode',
+    'formData_coverPage_crdNumber': 'crdNumber',
+    'formData_coverPage_secFileNumber': 'secFileNumber',
+    'formData_coverPage_form13FFileNumber': 'form13FFileNumber',
+    'formData_coverPage_reportType': 'reportType',
+    'formData_coverPage_isAmendment': 'isAmendment',
+    'formData_coverPage_amendmentNo': 'amendmentNo',
+    'formData_coverPage_amendmentInfo_amendmentType': 'amendmentType',
+    'formData_coverPage_amendmentInfo_confDeniedExpired': 'confDeniedExpired',
+    'formData_coverPage_additionalInformation': 'additionalInformation',
+    'formData_coverPage_provideInfoForInstruction5': 'provideInfoForInstruction5',
+
+    # Other Managers Info Mapping
+    'formData_coverPage_otherManagersInfo_otherManager': 'otherManager',
+    'formData_coverPage_otherManagersInfo_otherManager_cik': 'otherManagerCik',
+    'formData_coverPage_otherManagersInfo_otherManager_name': 'otherManagerName',
+    'formData_coverPage_otherManagersInfo_otherManager_crdNumber': 'otherManagerCrdNumber',
+    'formData_coverPage_otherManagersInfo_otherManager_secFileNumber': 'otherManagerSecFileNumber',
+    'formData_coverPage_otherManagersInfo_otherManager_form13FFileNumber': 'otherManagerForm13FFileNumber',
+
+    # Summary Page Mapping
+    'formData_summaryPage_isConfidentialOmitted': 'isConfidentialOmitted',
+    'formData_summaryPage_otherIncludedManagersCount': 'otherIncludedManagersCount',
+    'formData_summaryPage_tableEntryTotal': 'tableEntryTotal',
+    'formData_summaryPage_tableValueTotal': 'tableValueTotal',
+
+    # Other Managers 2 Info Mapping
+    'formData_summaryPage_otherManagers2Info_otherManager2': 'otherManager2',
+    'formData_summaryPage_otherManagers2Info_otherManager2_sequenceNumber': 'otherManager2SequenceNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_cik': 'otherManager2Cik',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_name': 'otherManager2Name',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_crdNumber': 'otherManager2CrdNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_secFileNumber': 'otherManager2SecFileNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_form13FFileNumber': 'otherManager2Form13FFileNumber',
+
+    # Signature Block Mapping
+    'formData_signatureBlock_name': 'signatureName',
+    'formData_signatureBlock_title': 'signatureTitle',
+    'formData_signatureBlock_phone': 'signaturePhone',
+    'formData_signatureBlock_signature': 'signature',
+    'formData_signatureBlock_city': 'signatureCity',
+    'formData_signatureBlock_stateOrCountry': 'signatureStateOrCountry',
+    'formData_signatureBlock_signatureDate': 'signatureDate',
+
+    # Header Data Mapping
+    'headerData_filerInfo_periodOfReport': 'periodOfReport',
+    'headerData_filerInfo_filer_fileNumber': 'filerFileNumber',
+    'headerData_filerInfo_filer_credentials_cik': 'filerCik',
+    'headerData_filerInfo_filer_credentials_ccc': 'filerCcc',
+    'headerData_filerInfo_flags_confirmingCopyFlag': 'confirmingCopyFlag',
+    'headerData_filerInfo_flags_returnCopyFlag': 'returnCopyFlag',
+    'headerData_filerInfo_flags_overrideInternetFlag': 'overrideInternetFlag',
+    'headerData_filerInfo_denovoRequest': 'denovoRequest',
+    'headerData_filerInfo_liveTestFlag': 'liveTestFlag',
+    'headerData_submissionType': 'submissionType',
+
+    # Schema and Metadata Mapping
+    'schemaLocation': 'schemaLocation',
+    'schemaVersion': 'schemaVersion',
+    'accession': 'accessionNumber'
+}

{datamule-1.2.5 → datamule-1.2.6}/datamule/document/processing.py
@@ -17,6 +17,12 @@ def process_tabular_data(self):
         tables = process_13fhr(self.data, self.accession)
     elif self.type in ["INFORMATION TABLE"]:
         tables = process_information_table(self.data, self.accession)
+    elif self.type in ["25-NSE", "25-NSE/A"]:
+        tables = process_25nse(self.data, self.accession)
+    # complete mark:
+    elif self.type in ["N-PX","N-PX/A"]:
+        tables = process_npx(self.data, self.accession)
+
     elif self.type in ["SBSEF","SBSEF/A","SBSEF-V","SBSEF-W"]:
         tables = process_sbsef(self.data, self.accession)
     elif self.type in ["SDR","SDR/A","SDR-W","SDR-A"]:

@@ -33,8 +39,7 @@ def process_tabular_data(self):
         tables = process_144(self.data, self.accession)
     elif self.type in ["24F-2NT", "24F-2NT/A"]:
         tables = process_24f2nt(self.data, self.accession)
-
-        tables = process_25nse(self.data, self.accession)
+
     elif self.type in ["ATS-N", "ATS-N/A"]:
         tables = process_ats(self.data, self.accession)
     # elif self.type in ["C","C-W","C-U","C-U-W","C/A","C/A-W",

@@ -53,8 +58,7 @@ def process_tabular_data(self):
     # tables = process_nmfp(self.data, self.accession)
     # elif self.type in ["NPORT-P","NPORT-P/A"]:
     # tables = process_nportp(self.data, self.accession)
-
-        tables = process_npx(self.data, self.accession)
+
     # elif self.type in ["TA-1","TA-1/A","TA-W","TA-2","TA-2/A"]:
     # tables = process_ta(self.data, self.accession)
     elif self.type in ["X-17A-5","X-17A-5/A"]:

@@ -70,6 +74,8 @@ def process_tabular_data(self):
     # tables = process_ex102_abs(self.data, self.accession)
     elif self.type == "PROXY VOTING RECORD":
         tables = process_proxy_voting_record(self.data, self.accession)
+    elif self.type == 'submission_metadata':
+        tables = process_submission_metadata(self.content, self.accession)
     else:
         warn(f"Processing for {self.type} is not implemented yet.")
         return []

@@ -601,4 +607,28 @@ def process_reg_a(data, accession):
 # raise NotImplementedError("Need to implement the rest of the MA processing")
 
 # def process_ncen(data, accession):
-# raise NotImplementedError("Need to implement the N-CEN processing")
+# raise NotImplementedError("Need to implement the N-CEN processing")
+
+# WIP
+# Note: going to pause this for now, as I don't have a great way of putting this in a csv.
+def process_submission_metadata(data,accession):
+    tables = []
+    document_data = safe_get(data, ['documents'])
+    if document_data:
+        tables.append(Table(_flatten_dict(document_data), 'document_submission_metadata', accession))
+
+    reporting_owner_data = safe_get(data,['reporting-owner'])
+    if reporting_owner_data:
+        tables.append(Table(_flatten_dict(reporting_owner_data), 'reporting_owner_submission_metadata', accession))
+
+    issuer_data = safe_get(data,['issuer'])
+    if issuer_data:
+        tables.append(Table(_flatten_dict(issuer_data), 'issuer_submission_metadata', accession))
+
+    # # construct metadata
+    # accession-number date-of-filing-date-change, depositor-cik effectiveness-date
+
+    # # other tables
+    # depositor, securitizer
+
+    return tables
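
process_submission_metadata leans on the package's existing safe_get and _flatten_dict helpers. Minimal stand-ins with the behavior it appears to assume (None on a missing key, underscore-joined flattening) might look like the sketch below; the real helpers may handle lists and other edge cases differently:

def safe_get(data, keys):
    # Walk nested dicts; return None as soon as a key is missing.
    for key in keys:
        if not isinstance(data, dict) or key not in data:
            return None
        data = data[key]
    return data

def _flatten_dict(d, parent_key=''):
    # Flatten nested dicts into underscore-joined keys: {'a': {'b': 1}} -> {'a_b': 1}.
    items = {}
    for k, v in d.items():
        new_key = f"{parent_key}_{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(_flatten_dict(v, new_key))
        else:
            items[new_key] = v
    return items

print(safe_get({'issuer': {'cik': '320193'}}, ['issuer']))  # {'cik': '320193'}
print(_flatten_dict({'issuer': {'cik': '320193'}}))         # {'issuer_cik': '320193'}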

{datamule-1.2.5 → datamule-1.2.6}/datamule/document/table.py
@@ -18,7 +18,7 @@ from .mappings.thirteenfhr import *
 from .mappings.twentyfivense import *
 from .mappings.twentyfourf2nt import *
 from .mappings.information_table import *
-
+from .mappings.submission_metadata import *
 # need to check if mappings correctly create new columns
 class Table():
     def __init__(self, data, type,accession):

@@ -228,6 +228,11 @@ class Table():
         elif self.type == 'signature_info_schedule_a':
             mapping_dict = signature_24f2nt_dict
 
+        # submission metadata
+        elif self.type == 'document_submission_metadata':
+            mapping_dict = document_submission_metadata_dict
+
+
         else:
             mapping_dict = {}
 

{datamule-1.2.5 → datamule-1.2.6}/datamule/helper.py
@@ -79,7 +79,16 @@ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
 
     # Convert ticker to CIK if provided
     if ticker is not None:
-
+        if isinstance(ticker, str):
+            ticker = [ticker]
+
+        ciks_from_ticker = []
+        for t in ticker:
+            ciks = get_cik_from_dataset('listed_filer_metadata', 'ticker', t)
+            if ciks:
+                ciks_from_ticker.extend(ciks)
+
+        cik = ciks
 
     # Normalize CIK format
     if cik is not None:
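
The helper.py change lets ticker be a single symbol or a list, resolving each through the listed_filer_metadata dataset before CIK normalization. A runnable sketch of that resolution loop; get_cik_from_dataset is stubbed here with a fabricated table, whereas the real helper reads the downloaded dataset:

# Stub standing in for datamule's real get_cik_from_dataset helper.
FAKE_TICKER_TABLE = {'AAPL': ['320193'], 'MSFT': ['789019']}

def get_cik_from_dataset(dataset, field, value):
    return FAKE_TICKER_TABLE.get(value, [])

ticker = ['AAPL', 'MSFT']
if isinstance(ticker, str):
    ticker = [ticker]              # single symbol -> one-element list

ciks_from_ticker = []
for t in ticker:
    ciks = get_cik_from_dataset('listed_filer_metadata', 'ticker', t)
    if ciks:
        ciks_from_ticker.extend(ciks)

print(ciks_from_ticker)  # ['320193', '789019']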

{datamule-1.2.5 → datamule-1.2.6}/datamule/index.py
@@ -1,16 +1,16 @@
-
+
 from .sec.submissions.textsearch import query
-from .helper import _process_cik_and_metadata_filters
+from .helper import _process_cik_and_metadata_filters
+from pathlib import Path
 
 class Index:
-    def __init__(self
-
+    def __init__(self):
+        pass
 
     def search_submissions(
         self,
         text_query,
-
-        end_date=None,
+        filing_date=None,
         submission_type=None,
         cik=None,
         ticker=None,

@@ -47,16 +47,14 @@ class Index:
         # Execute the search query
         results = query(
             f'{text_query}',
-            filing_date=
+            filing_date=filing_date,
             requests_per_second=requests_per_second,
             quiet=quiet,
             submission_type=submission_type,
             **kwargs
         )
 
-
-        if self.path:
-            self._save_results(results, text_query)
+
 
         return results
 

{datamule-1.2.5 → datamule-1.2.6}/datamule/portfolio.py
@@ -9,22 +9,28 @@ import os
 from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
-from .sec.submissions.monitor import monitor
-from .sec.xbrl.xbrlmonitor import XBRLMonitor
+from .sec.submissions.monitor import Monitor
+#from .sec.xbrl.xbrlmonitor import XBRLMonitor
 
 
 class Portfolio:
     def __init__(self, path):
         self.path = Path(path)
+        self.api_key = None
         self.submissions = []
         self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1
+
+        self.monitor = Monitor()
 
         if self.path.exists():
             self._load_submissions()
             self.submissions_loaded = True
         else:
             self.path.mkdir(parents=True, exist_ok=True)
+
+    def set_api_key(self, api_key):
+        self.api_key = api_key
 
     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir()]

@@ -132,6 +138,7 @@ class Portfolio:
         seclibrary_download(
             output_dir=self.path,
             cik=cik,
+            api_key=self.api_key,
             submission_type=submission_type,
             filing_date=filing_date,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
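
A short usage sketch of the new API-key plumbing: the key is stored on the Portfolio by set_api_key and forwarded to seclibrary_download as api_key. The folder name and key below are placeholders:

from datamule import Portfolio

portfolio = Portfolio('filings')       # created on disk if it doesn't exist yet
portfolio.set_api_key('YOUR_API_KEY')  # stored as self.api_key, passed to seclibrary_download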

@@ -149,20 +156,18 @@ class Portfolio:
         )
 
         self.submissions_loaded = False
-    def monitor_submissions(self,data_callback=None,
-
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=600000):
 
-        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
 
-        monitor(
+        self.monitor.monitor_submissions(
             data_callback=data_callback,
-
-            cik=cik,
-            submission_type=submission_type,
+            interval_callback=interval_callback,
             polling_interval=polling_interval,
-            requests_per_second=requests_per_second,
             quiet=quiet,
-            start_date=start_date
+            start_date=start_date,
+            validation_interval=validation_interval
         )
 
 

datamule-1.2.6/datamule/sec/submissions/monitor.py (new file)
@@ -0,0 +1,183 @@
+import time
+from collections import deque
+from datetime import datetime
+import xml.etree.ElementTree as ET
+import re
+import asyncio
+from ..utils import headers, PreciseRateLimiter
+from .eftsquery import EFTSQuery
+import aiohttp
+
+
+async def poll_rss(limiter):
+    base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
+
+    # Create a session specifically for this RSS polling operation
+    async with aiohttp.ClientSession(headers=headers) as session:
+        # Use the rate limiter before making the request
+        async with limiter:
+            # Make the HTTP request with the session
+            async with session.get(base_url) as response:
+                content = await response.read()
+
+    # Process the content
+    content_str = content.decode('utf-8')
+    root = ET.fromstring(content_str)
+    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
+    entries = root.findall('atom:entry', namespace)
+    grouped = {}
+
+    for entry in entries:
+        url = entry.find('atom:link', namespace).get('href')
+        accession = re.search(r'/(\d{10})-(\d{2})-(\d{6})', url)
+        accession = accession.group(1) + accession.group(2) + accession.group(3)
+        cik = re.search(r'/data/(\d+)/', url).group(1)
+
+        if accession not in grouped:
+            grouped[accession] = {'submission_type': '', 'ciks': set(), 'filing_date': ''}
+
+        grouped[accession]['ciks'].add(cik)
+        grouped[accession]['submission_type'] = entry.find('atom:category', namespace).get('term')
+        summary_text = entry.find('atom:summary', namespace).text
+        filing_date_match = re.search(r'Filed:</b>\s*(\d{4}-\d{2}-\d{2})', summary_text)
+        if filing_date_match:
+            grouped[accession]['filing_date'] = filing_date_match.group(1)
+
+    results = [{'accession': int(k.replace('-', '')), 'submission_type': v['submission_type'], 'ciks': list(v['ciks']), 'filing_date': v['filing_date']} for k, v in grouped.items()]
+    return results
+
+def clean_efts_hits(hits):
+    # clean hits
+    hits = [{'accession': int(hit['_source']['adsh'].replace('-','')), 'filing_date': hit['_source']['file_date'], 'ciks': hit['_source']['ciks']} for hit in hits]
+    return hits
+
+class Monitor():
+    def __init__(self):
+        self.accessions = deque(maxlen=50000)
+        self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
+        self.efts_query = EFTSQuery(quiet=True)
+        self.efts_query.limiter = self.ratelimiters['sec.gov']
+
+    def set_domain_rate_limit(self, domain, rate):
+        self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
+        if domain == 'sec.gov':
+            self.efts_query.limiter = self.ratelimiters[domain]
+
+    async def _async_run_efts_query(self, **kwargs):
+        """Async helper method to run EFTS query without creating a new event loop"""
+        # Make sure to set quiet parameter if provided in kwargs
+        self.efts_query.quiet = kwargs.get('quiet', True)
+        return await self.efts_query.query(
+            cik=kwargs.get('cik'),
+            submission_type=kwargs.get('submission_type'),
+            filing_date=kwargs.get('filing_date'),
+            location=kwargs.get('location'),
+            callback=kwargs.get('callback'),
+            name=kwargs.get('name')
+        )
+
+    async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
+        """
+        Async implementation of monitor_submissions.
+        """
+
+        # Backfill if start_date is provided
+        if start_date is not None:
+            today_date = datetime.now().date().strftime('%Y-%m-%d')
+            if not quiet:
+                print(f"Backfilling from {start_date} to {today_date}")
+
+            hits = clean_efts_hits(await self._async_run_efts_query(
+                filing_date=(start_date, today_date),
+                quiet=quiet
+            ))
+
+            new_hits = self._filter_new_accessions(hits)
+            if not quiet:
+                print(f"New submissions found: {len(new_hits)}")
+            if new_hits and data_callback:
+                data_callback(new_hits)
+
+        last_polling_time = time.time()
+        last_validation_time = last_polling_time
+        current_time = last_polling_time
+
+        while True:
+            # RSS polling
+            if not quiet:
+                print(f"Polling RSS feed")
+            results = await poll_rss(self.ratelimiters['sec.gov'])
+            new_results = self._filter_new_accessions(results)
+            if new_results:
+                if not quiet:
+                    print(f"Found {len(new_results)} new submissions via RSS")
+                if data_callback:
+                    data_callback(new_results)
+
+            # EFTS validation
+            if validation_interval and (current_time - last_validation_time) >= validation_interval/1000:
+                # Get submissions from the last 24 hours for validation
+                today_date = datetime.now().strftime('%Y-%m-%d')
+                if not quiet:
+                    print(f"Validating submissions from {today_date}")
+
+                hits = clean_efts_hits(await self._async_run_efts_query(
+                    filing_date=(today_date, today_date),
+                    quiet=quiet
+                ))
+
+                new_hits = self._filter_new_accessions(hits)
+                if new_hits:
+                    if not quiet:
+                        print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                    if data_callback:
+                        data_callback(new_hits)
+                last_polling_time = time.time()
+                last_validation_time = current_time
+
+            # Interval callback
+            if interval_callback:
+                interval_callback()
+
+            next_poll_time = last_polling_time + (polling_interval / 1000)
+            current_time = time.time()
+            time_to_sleep = max(0, next_poll_time - current_time)
+            await asyncio.sleep(time_to_sleep)
+            last_polling_time = next_poll_time
+
+
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=60000):
+        """
+        Monitor SEC submissions using the EDGAR system.
+        :param data_callback: function to call with the data
+        :param interval_callback: function that executes between polls
+        :param polling_interval: interval between polls in milliseconds
+        :param quiet: if True, suppresses output
+        :param start_date: backfill start date in YYYY-MM-DD format
+        :param validation_interval: interval between validation in milliseconds
+
+        This function combines the speed of the RSS feed (fast, but misses some submissions) with the accuracy of the EFTS system.
+        """
+        # This is now a synchronous wrapper around the async implementation
+        return asyncio.run(self._async_monitor_submissions(
+            data_callback=data_callback,
+            interval_callback=interval_callback,
+            polling_interval=polling_interval,
+            quiet=quiet,
+            start_date=start_date,
+            validation_interval=validation_interval
+        ))
+
+    def _filter_new_accessions(self, items):
+        """Filter items to only include those with new accession numbers."""
+        new_items = []
+        for item in items:
+            accession = item['accession']
+            if accession not in self.accessions:
+                self.accessions.append(accession)
+                new_items.append(item)
+        return new_items

{datamule-1.2.5 → datamule-1.2.6}/datamule/sec/xbrl/streamcompanyfacts.py
@@ -2,7 +2,7 @@ import asyncio
 import aiohttp
 import json
 from tqdm import tqdm
-from ..utils import PreciseRateLimiter, RateMonitor,
+from ..utils import PreciseRateLimiter, RateMonitor, headers
 
 async def fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar):
     # Format CIK with leading zeros to 10 digits

{datamule-1.2.5 → datamule-1.2.6}/datamule/seclibrary/downloader.py
@@ -1,7 +1,6 @@
 import os
 import asyncio
 import aiohttp
-from pathlib import Path
 from tqdm import tqdm
 import time
 import shutil

@@ -13,11 +12,12 @@ from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from queue import Queue, Empty
 from threading import Thread
-from secsgml import parse_sgml_submission
 from .query import query
 from os import cpu_count
 from ..submission import Submission
 
+
+
 class Downloader:
     def __init__(self, api_key=None):
         self.BASE_URL = "https://library.datamule.xyz/original/nc/"