datamule 1.5.9__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +14 -1
- datamule/document/mappings_new/__init__.py +0 -0
- datamule/document/mappings_new/mappings.py +13 -0
- datamule/document/mappings_new/ownership.py +174 -0
- datamule/sec/submissions/monitor.py +115 -75
- datamule/submission.py +2 -1
- datamule/utils/__init__.py +0 -0
- datamule/utils/construct_submissions_data.py +150 -0
- {datamule-1.5.9.dist-info → datamule-1.6.1.dist-info}/METADATA +2 -1
- {datamule-1.5.9.dist-info → datamule-1.6.1.dist-info}/RECORD +12 -7
- {datamule-1.5.9.dist-info → datamule-1.6.1.dist-info}/WHEEL +0 -0
- {datamule-1.5.9.dist-info → datamule-1.6.1.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED

@@ -12,6 +12,7 @@ from .processing import process_tabular_data
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
+from secxbrl import parse_inline_xbrl
 
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -33,6 +34,7 @@ class Document:
         self.extension = extension
         # this will be filled by parsed
         self.data = None
+        self.xbrl = None
 
     #_load_text_content
     def _preprocess_txt_content(self):
@@ -101,12 +103,23 @@ class Document:
         if self.extension in ['.htm', '.html', '.txt','.xml']:
             return bool(re.search(pattern, self.content))
         return False
+
+    def parse_xbrl(self,type='inline'):
+        if self.xbrl:
+            return
+        if type =='inline':
+            if self.extension not in ['.htm','.html']:
+                return
+
+            self.xbrl = parse_inline_xbrl(self.content)
+        else:
+            raise ValueError("Only inline has been implemented so far.")
 
     # Note: this method will be heavily modified in the future
     def parse(self):
         # check if we have already parsed the content
         if self.data:
-            return
+            return
 
         mapping_dict = None
 
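The new `parse_xbrl` method is memoized: it returns immediately if `self.xbrl` is already set, silently skips non-HTML documents, and raises for any parser type other than 'inline'. A minimal usage sketch (the constructor arguments and placeholder content are illustrative assumptions, not taken from a real filing):

    from datamule.document.document import Document

    doc = Document(
        type='4',                          # Form 4 ownership filing
        content=b'<html>...</html>',       # raw filing content (placeholder)
        extension='.htm',                  # inline XBRL is only parsed for .htm/.html
        accession='0000000000-24-000000',  # placeholder accession number
        filing_date='2024-06-03',
    )
    doc.parse_xbrl()   # defaults to type='inline'; fills doc.xbrl via secxbrl
    doc.parse_xbrl()   # no-op: doc.xbrl is already populated
    print(doc.xbrl)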
datamule/document/mappings_new/__init__.py
File without changes (empty file)
datamule/document/mappings_new/mappings.py
ADDED

@@ -0,0 +1,13 @@
+import ownership
+
+
+# key is document type
+# note: this assumes XML format.
+table_mappings = {
+    '3' : ownership.mappings,
+    '3/A' : ownership.mappings,
+    '4' : ownership.mappings,
+    '4/A' : ownership.mappings,
+    '5' : ownership.mappings,
+    '5/A' : ownership.mappings,
+}
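All six Section 16 form types (3, 4, 5 and their amendments) point at the same `ownership.mappings` object, so a consumer only needs one lookup keyed by document type. A hedged sketch of that lookup (the consuming call site is not part of this diff):

    # Hypothetical consumer; assumes table_mappings has been imported.
    document_type = '4/A'
    mapping = table_mappings.get(document_type)  # None for unmapped types
    if mapping is not None:
        # mapping is ownership.mappings: table name -> column-rename dict
        print(sorted(mapping.keys()))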
datamule/document/mappings_new/ownership.py
ADDED

@@ -0,0 +1,174 @@
+
+
+
+# Non-derivative transaction ownership mapping
+ownership_non_derivative_transactions_dict = {
+    'securityTitle_value': 'securityTitle',
+    'securityTitle_footnote': 'securityTitleFootnote',
+    'transactionDate_value': 'transactionDate',
+    'transactionDate_footnote': 'transactionDateFootnote',
+    'deemedExecutionDate_value': 'deemedExecutionDate',
+    'deemedExecutionDate_footnote': 'deemedExecutionDateFootnote',
+    'transactionCoding_transactionFormType': 'transactionFormType',
+    'transactionCoding_transactionCode': 'transactionCode',
+    'transactionCoding_equitySwapInvolved': 'equitySwapInvolved',
+    'transactionCoding_footnote': 'transactionCodingFootnote',
+    'transactionAmounts_transactionShares_value': 'transactionShares',
+    'transactionAmounts_transactionShares_footnote': 'transactionSharesFootnote',
+    'transactionAmounts_transactionPricePerShare_value': 'transactionPricePerShare',
+    'transactionAmounts_transactionPricePerShare_footnote': 'transactionPricePerShareFootnote',
+    'transactionAmounts_transactionAcquiredDisposedCode_value': 'transactionAcquiredDisposedCode',
+    'transactionAmounts_transactionAcquiredDisposedCode_footnote': 'transactionAcquiredDisposedCodeFootnote',
+    'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
+    'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
+    'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
+    'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
+    'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
+    'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
+    'transactionTimeliness_value': 'transactionTimeliness',
+    'transactionTimeliness_footnote': 'transactionTimelinessFootnote',
+    'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
+    'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote'
+}
+
+# Derivative transaction ownership mapping
+derivative_transaction_ownership_dict = {
+    'securityTitle_value': 'securityTitle',
+    'securityTitle_footnote': 'securityTitleFootnote',
+    'conversionOrExercisePrice_value': 'conversionOrExercisePrice',
+    'conversionOrExercisePrice_footnote': 'conversionOrExercisePriceFootnote',
+    'transactionDate_value': 'transactionDate',
+    'transactionDate_footnote': 'transactionDateFootnote',
+    'deemedExecutionDate_value': 'deemedExecutionDate',
+    'deemedExecutionDate_footnote': 'deemedExecutionDateFootnote',
+    'transactionCoding_transactionFormType': 'transactionFormType',
+    'transactionCoding_transactionCode': 'transactionCode',
+    'transactionCoding_equitySwapInvolved': 'equitySwapInvolved',
+    'transactionCoding_footnote': 'transactionCodingFootnote',
+    'transactionAmounts_transactionShares_value': 'transactionShares',
+    'transactionAmounts_transactionShares_footnote': 'transactionSharesFootnote',
+    'transactionAmounts_transactionPricePerShare_value': 'transactionPricePerShare',
+    'transactionAmounts_transactionPricePerShare_footnote': 'transactionPricePerShareFootnote',
+    'transactionAmounts_transactionAcquiredDisposedCode_value': 'transactionAcquiredDisposedCode',
+    'transactionAmounts_transactionTotalValue_value': 'transactionTotalValue',
+    'transactionAmounts_transactionTotalValue_footnote': 'transactionTotalValueFootnote',
+    'exerciseDate_value': 'exerciseDate',
+    'exerciseDate_footnote': 'exerciseDateFootnote',
+    'expirationDate_value': 'expirationDate',
+    'expirationDate_footnote': 'expirationDateFootnote',
+    'underlyingSecurity_underlyingSecurityTitle_value': 'underlyingSecurityTitle',
+    'underlyingSecurity_underlyingSecurityTitle_footnote': 'underlyingSecurityTitleFootnote',
+    'underlyingSecurity_underlyingSecurityShares_value': 'underlyingSecurityShares',
+    'underlyingSecurity_underlyingSecurityShares_footnote': 'underlyingSecuritySharesFootnote',
+    'underlyingSecurity_underlyingSecurityValue_value': 'underlyingSecurityValue',
+    'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
+    'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
+    'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
+    'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
+    'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
+    'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
+    'transactionTimeliness_value': 'transactionTimeliness',
+    'transactionTimeliness_footnote': 'transactionTimelinessFootnote',
+    'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
+    'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote',
+    'transactionAmounts_transactionAcquiredDisposedCode_footnote': 'transactionAcquiredDisposedCodeFootnote',
+    'underlyingSecurity_underlyingSecurityValue_footnote': 'underlyingSecurityValueFootnote'
+}
+
+# Non-derivative holding ownership mapping
+non_derivative_holding_ownership_dict = {
+    'securityTitle_value': 'securityTitle',
+    'securityTitle_footnote': 'securityTitleFootnote',
+    'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
+    'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
+    'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
+    'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
+    'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
+    'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
+    'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
+    'transactionCoding_footnote': 'transactionCodingFootnote',
+    'transactionCoding_transactionFormType': 'transactionFormType',
+    'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote'
+}
+
+# Derivative holding ownership mapping
+derivative_holding_ownership_dict = {
+    'securityTitle_value': 'securityTitle',
+    'securityTitle_footnote': 'securityTitleFootnote',
+    'conversionOrExercisePrice_value': 'conversionOrExercisePrice',
+    'conversionOrExercisePrice_footnote': 'conversionOrExercisePriceFootnote',
+    'exerciseDate_value': 'exerciseDate',
+    'exerciseDate_footnote': 'exerciseDateFootnote',
+    'expirationDate_value': 'expirationDate',
+    'expirationDate_footnote': 'expirationDateFootnote',
+    'underlyingSecurity_underlyingSecurityTitle_value': 'underlyingSecurityTitle',
+    'underlyingSecurity_underlyingSecurityTitle_footnote': 'underlyingSecurityTitleFootnote',
+    'underlyingSecurity_underlyingSecurityShares_value': 'underlyingSecurityShares',
+    'underlyingSecurity_underlyingSecurityShares_footnote': 'underlyingSecuritySharesFootnote',
+    'underlyingSecurity_underlyingSecurityValue_value': 'underlyingSecurityValue',
+    'underlyingSecurity_underlyingSecurityValue_footnote': 'underlyingSecurityValueFootnote',
+    'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
+    'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
+    'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
+    'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
+    'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
+    'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
+    'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
+    'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote',
+    'transactionCoding_transactionFormType': 'transactionFormType',
+    'transactionCoding_footnote': 'transactionCodingFootnote'
+}
+
+# Reporting owner ownership mapping
+reporting_owner_ownership_dict = {
+    'reportingOwnerAddress_rptOwnerCity': 'rptOwnerCity',
+    'reportingOwnerAddress_rptOwnerState': 'rptOwnerState',
+    'reportingOwnerAddress_rptOwnerStateDescription': 'rptOwnerStateDescription',
+    'reportingOwnerAddress_rptOwnerStreet1': 'rptOwnerStreet1',
+    'reportingOwnerAddress_rptOwnerStreet2': 'rptOwnerStreet2',
+    'reportingOwnerAddress_rptOwnerZipCode': 'rptOwnerZipCode',
+    'reportingOwnerId_rptOwnerCik': 'rptOwnerCik',
+    'reportingOwnerId_rptOwnerName': 'rptOwnerName',
+    'reportingOwnerRelationship_isDirector': 'rptOwnerIsDirector',
+    'reportingOwnerRelationship_isOfficer': 'rptOwnerIsOfficer',
+    'reportingOwnerRelationship_isTenPercentOwner': 'rptOwnerIsTenPercentOwner',
+    'reportingOwnerRelationship_isOther': 'rptOwnerIsOther',
+    'reportingOwnerRelationship_officerTitle': 'rptOwnerOfficerTitle',
+    'reportingOwnerRelationship_otherText': 'rptOwnerOtherText'
+}
+
+# Metadata ownership mapping
+metadata_ownership_dict = {
+    'periodOfReport': 'periodOfReport',
+    'issuer_issuerCik': 'issuerCik',
+    'issuer_issuerName': 'issuerName',
+    'issuer_issuerTradingSymbol': 'issuerTradingSymbol',
+    'documentType': 'documentType',
+    'remarks': 'remarks',
+    'documentDescription': 'documentDescription',
+    'footnotes': 'footnotes',
+    'notSubjectToSection16': 'notSubjectToSection16',
+    'form3HoldingsReported': 'form3HoldingsReported',
+    'form4TransactionsReported': 'form4TransactionsReported',
+    'noSecuritiesOwned': 'noSecuritiesOwned',
+    'aff10b5One': 'aff10b5One',
+    'dateOfOriginalSubmission': 'dateOfOriginalSubmission',
+    'schemaVersion': 'schemaVersion'
+}
+
+# Owner signature ownership mapping
+owner_signature_ownership_dict = {
+    'signatureName': 'signatureName',
+    'signatureDate': 'signatureDate'
+}
+
+
+mappings = {
+    'ownership_non_derivative_transactions' : ownership_non_derivative_transactions_dict,
+    'ownership_derivative_transactions' : derivative_transaction_ownership_dict,
+    'ownership_non_derivative_holdings' : non_derivative_holding_ownership_dict,
+    'ownership_derivative_holdings' : derivative_holding_ownership_dict,
+    'ownership_reporting_owner' : reporting_owner_ownership_dict,
+    'ownership_metadata' : metadata_ownership_dict,
+    'ownership_owner_signature' : owner_signature_ownership_dict
+}
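Each dict maps a flattened XML path (underscore-joined parent/child names, ending in `_value` or `_footnote`) to a short column name, so parsed ownership tables get readable headers. A sketch of applying one of these renames to a flattened record (the input row is invented for illustration):

    # Subset of ownership_non_derivative_transactions_dict, applied to a fake row.
    rename = {
        'securityTitle_value': 'securityTitle',
        'transactionDate_value': 'transactionDate',
        'transactionAmounts_transactionShares_value': 'transactionShares',
    }
    raw_row = {
        'securityTitle_value': 'Common Stock',
        'transactionDate_value': '2024-06-03',
        'transactionAmounts_transactionShares_value': '1000',
    }
    clean_row = {rename.get(key, key): value for key, value in raw_row.items()}
    # {'securityTitle': 'Common Stock', 'transactionDate': '2024-06-03',
    #  'transactionShares': '1000'}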
datamule/sec/submissions/monitor.py
CHANGED

@@ -9,16 +9,14 @@ from .eftsquery import EFTSQuery
 import aiohttp
 from zoneinfo import ZoneInfo
 
-async def poll_rss(limiter):
+async def poll_rss(limiter, session):
     base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
 
-    # …
-    async with …
-    # Use the …
-    async with …
-
-    async with session.get(base_url) as response:
-        content = await response.read()
+    # Use the rate limiter before making the request
+    async with limiter:
+        # Use the provided session instead of creating a new one
+        async with session.get(base_url) as response:
+            content = await response.read()
 
     # Process the content
     content_str = content.decode('utf-8')
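`poll_rss` now receives the caller's `aiohttp.ClientSession` instead of opening one per poll, so repeated polls reuse pooled connections rather than paying TCP/TLS setup each time. The pattern in isolation (a sketch; `fetch` and the loop are illustrative):

    import asyncio
    import aiohttp

    async def fetch(session: aiohttp.ClientSession, url: str) -> bytes:
        # Reusing the session keeps connections pooled across calls.
        async with session.get(url) as response:
            return await response.read()

    async def main():
        async with aiohttp.ClientSession() as session:
            for _ in range(3):
                body = await fetch(session, 'https://www.sec.gov/')
                print(len(body))
                await asyncio.sleep(1)

    asyncio.run(main())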
@@ -70,12 +68,31 @@ class Monitor():
         self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
         self.efts_query = EFTSQuery(quiet=True)
         self.efts_query.limiter = self.ratelimiters['sec.gov']
+        self.session = None
+        self.session_created_at = 0
+        self.session_lifetime = 300  # 5 minutes in seconds
 
     def set_domain_rate_limit(self, domain, rate):
         self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
         if domain == 'sec.gov':
             self.efts_query.limiter = self.ratelimiters[domain]
 
+    async def _ensure_fresh_session(self):
+        """Ensure we have a fresh session, recreating if expired or missing"""
+        current_time = time.time()
+
+        # Check if we need a new session
+        if (self.session is None or
+            current_time - self.session_created_at > self.session_lifetime):
+
+            # Close old session if it exists
+            if self.session:
+                await self.session.close()
+
+            # Create new session
+            self.session = aiohttp.ClientSession(headers=headers)
+            self.session_created_at = current_time
+
     async def _async_run_efts_query(self, **kwargs):
         """Async helper method to run EFTS query without creating a new event loop"""
         # Make sure to set quiet parameter if provided in kwargs
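`_ensure_fresh_session` is a time-based recycle: any session older than `session_lifetime` seconds is closed and replaced before use, which guards against servers silently dropping long-lived connections. The same idea in standalone form (the class and its names are illustrative, not from the package):

    import time
    import aiohttp

    class RecyclingSession:
        """Illustrative: replace an aiohttp session after a fixed lifetime."""
        def __init__(self, lifetime: float = 300.0):
            self.session = None
            self.created_at = 0.0
            self.lifetime = lifetime

        async def get(self) -> aiohttp.ClientSession:
            now = time.time()
            if self.session is None or now - self.created_at > self.lifetime:
                if self.session:
                    await self.session.close()  # close the stale session first
                self.session = aiohttp.ClientSession()
                self.created_at = now
            return self.session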
@@ -103,83 +120,106 @@ class Monitor():
         if polling_interval is None and validation_interval is None:
             raise ValueError("At least one of polling_interval or validation_interval must be specified")
 
-        # …
-
-        today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
-        if not quiet:
-            print(f"Backfilling from {start_date} to {today_date}")
-
-        hits = clean_efts_hits(await self._async_run_efts_query(
-            filing_date=(start_date, today_date),
-            quiet=quiet
-        ))
-
-        new_hits = self._filter_new_accessions(hits)
-        if not quiet:
-            print(f"New submissions found: {len(new_hits)}")
-        if new_hits and data_callback:
-            data_callback(new_hits)
-
-        # Initialize timing variables
-        current_time = time.time()
-        last_polling_time = current_time
-        last_validation_time = current_time
-
-        # Determine which operations to perform
-        do_polling = polling_interval is not None
-        do_validation = validation_interval is not None
+        # Ensure we have a fresh session
+        await self._ensure_fresh_session()
 
-
-
-
-        # RSS polling (if enabled)
-        if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
-            if not quiet:
-                print(f"Polling RSS feed")
-            results = await poll_rss(self.ratelimiters['sec.gov'])
-            new_results = self._filter_new_accessions(results)
-            if new_results:
-                if not quiet:
-                    print(f"Found {len(new_results)} new submissions via RSS")
-                if data_callback:
-                    data_callback(new_results)
-            last_polling_time = current_time
-
-        # EFTS validation (if enabled)
-        if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
-            # Get submissions from the last 24 hours for validation
+        try:
+            # Backfill if start_date is provided
+            if start_date is not None:
                 today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
                 if not quiet:
-                    print(f"…
+                    print(f"Backfilling from {start_date} to {today_date}")
 
                 hits = clean_efts_hits(await self._async_run_efts_query(
-                    filing_date=(…
+                    filing_date=(start_date, today_date),
                     quiet=quiet
                 ))
-
+
                 new_hits = self._filter_new_accessions(hits)
-                if …
-
-
-
-
-
+                if not quiet:
+                    print(f"New submissions found: {len(new_hits)}")
+                if new_hits and data_callback:
+                    data_callback(new_hits)
+
+            # Initialize timing variables
+            current_time = time.time()
+            last_polling_time = current_time
+            last_validation_time = current_time
 
-        # …
-
-
-
-        # Calculate next wake-up time
-        next_times = []
-        if do_polling:
-            next_times.append(last_polling_time + (polling_interval / 1000))
-        if do_validation:
-            next_times.append(last_validation_time + (validation_interval / 1000))
+            # Determine which operations to perform
+            do_polling = polling_interval is not None
+            do_validation = validation_interval is not None
 
-
-
-
-
+            while True:
+                current_time = time.time()
+
+                # RSS polling (if enabled)
+                if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
+                    if not quiet:
+                        print(f"Polling RSS feed")
+
+                    # Ensure session is fresh before polling
+                    await self._ensure_fresh_session()
+
+                    try:
+                        results = await poll_rss(self.ratelimiters['sec.gov'], self.session)
+                        new_results = self._filter_new_accessions(results)
+                        if new_results:
+                            if not quiet:
+                                print(f"Found {len(new_results)} new submissions via RSS")
+                            if data_callback:
+                                data_callback(new_results)
+                    except Exception as e:
+                        if not quiet:
+                            print(f"RSS polling error: {e}, will recreate session on next poll")
+                        # Force session recreation on next poll
+                        if self.session:
+                            await self.session.close()
+                        self.session = None
+
+                    last_polling_time = current_time
+
+                # EFTS validation (if enabled)
+                if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
+                    # Get submissions from the last 24 hours for validation
+                    today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
+                    if not quiet:
+                        print(f"Validating submissions from {today_date}")
+
+                    hits = clean_efts_hits(await self._async_run_efts_query(
+                        filing_date=(today_date, today_date),
+                        quiet=quiet
+                    ))
+
+                    new_hits = self._filter_new_accessions(hits)
+                    if new_hits:
+                        if not quiet:
+                            print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                        if data_callback:
+                            data_callback(new_hits)
+                    last_validation_time = current_time
+
+                # Interval callback
+                if interval_callback:
+                    interval_callback()
+
+                # Calculate next wake-up time
+                next_times = []
+                if do_polling:
+                    next_times.append(last_polling_time + (polling_interval / 1000))
+                if do_validation:
+                    next_times.append(last_validation_time + (validation_interval / 1000))
+
+                next_wake_time = min(next_times)
+                current_time = time.time()
+                time_to_sleep = max(0, next_wake_time - current_time)
+                await asyncio.sleep(time_to_sleep)
+
+        finally:
+            # Clean up the session when done
+            if self.session:
+                await self.session.close()
+                self.session = None
 
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
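The rewritten loop wraps everything after the initial session setup in try/finally so the shared session is always closed, recreates the session after RSS errors, and interleaves RSS polling and EFTS validation on independent millisecond intervals, sleeping until the next due task. A hedged usage sketch of the wrapper whose signature appears at the end of the diff above (only parameters visible in this diff are used; the import path is assumed from the file location):

    from datamule.sec.submissions.monitor import Monitor

    def on_new_submissions(hits):
        for hit in hits:
            print(hit)  # one record per newly seen accession number

    monitor = Monitor()
    monitor.monitor_submissions(
        data_callback=on_new_submissions,
        polling_interval=1000,  # milliseconds between RSS polls
        quiet=False,
    )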
datamule/submission.py
CHANGED

@@ -251,7 +251,8 @@ class Submission:
         try:
             content = tar.extractfile(filename+'.zst').read()
         except:
-
+            # some of these issues are on SEC data end, will fix when I setup cloud.
+            raise ValueError(f"Something went wrong with tar: {self.path}")
 
         # Decompress if compressed
         if filename.endswith('.gz'):
             content = gzip.decompress(content)
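For context, submission archives are tarballs whose members may be zstandard-compressed; the change above makes the bare `except:` branch raise an explicit ValueError instead of failing silently. A simplified sketch of the surrounding extract-and-decompress flow (the function name and structure are assumed, not shown in this diff):

    import tarfile
    import zstandard

    def read_member(tar: tarfile.TarFile, filename: str) -> bytes:
        try:
            content = tar.extractfile(filename + '.zst').read()
        except Exception:
            # Mirror the new behavior: fail loudly rather than continue with no data.
            raise ValueError(f"Something went wrong with tar member: {filename}")
        return zstandard.ZstdDecompressor().decompress(content)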
datamule/utils/__init__.py
File without changes (empty file)

datamule/utils/construct_submissions_data.py
ADDED

@@ -0,0 +1,150 @@
+import zipfile
+import json
+import csv
+import os
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+import threading
+from tqdm import tqdm
+import urllib.request
+
+headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
+
+def process_file_batch(zip_file, filenames_batch):
+    """Process a batch of files from the zip archive"""
+    batch_filings = []
+
+    for filename in filenames_batch:
+        if not filename.startswith('CIK'):
+            continue
+
+        try:
+            # Extract CIK from filename
+            cik = int(filename.split('.')[0].split('-')[0][3:])
+
+            # Read raw bytes and parse JSON
+            with zip_file.open(filename) as file:
+                raw_data = file.read()
+                submissions_dct = json.loads(raw_data)
+
+            # Handle different file types
+            if 'submissions' in filename:
+                filings_data = submissions_dct
+            else:
+                filings_data = submissions_dct['filings']['recent']
+
+            # Extract required data
+            accession_numbers = filings_data['accessionNumber']
+            filing_dates = filings_data['filingDate']
+            forms = filings_data['form']
+
+            # Create filing records for this file
+            for j in range(len(accession_numbers)):
+                filing_record = {
+                    'accessionNumber': accession_numbers[j],
+                    'filingDate': filing_dates[j],
+                    'form': forms[j],
+                    'cik': cik
+                }
+                batch_filings.append(filing_record)
+
+        except Exception as e:
+            print(f"Error processing {filename}: {e}")
+            continue
+
+    return batch_filings
+
+def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
+    """Thread-safe CSV writing with lock"""
+    with write_lock:
+        if is_first_write:
+            with open(output_path, 'w', newline='') as csvfile:
+                fieldnames = ['accessionNumber', 'filingDate', 'form', 'cik']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writeheader()
+                writer.writerows(filings_data)
+        else:
+            with open(output_path, 'a', newline='') as csvfile:
+                fieldnames = ['accessionNumber', 'filingDate', 'form', 'cik']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writerows(filings_data)
+
+def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4, batch_size=100):
+    """Creates a list of dicts of every accession number, with filing date, submission type, and ciks"""
+
+    if submissions_zip_path is None:
+        url = "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip"
+
+        temp_dir = tempfile.mkdtemp()
+        zip_path = os.path.join(temp_dir, 'submissions.zip')
+
+        req = urllib.request.Request(url, headers=headers)
+
+        with urllib.request.urlopen(req) as response:
+            total_size = int(response.headers.get('Content-Length', 0))
+
+            with open(zip_path, 'wb') as f, tqdm(
+                desc="Downloading",
+                total=total_size,
+                unit='B',
+                unit_scale=True,
+                unit_divisor=1024,
+            ) as pbar:
+                while True:
+                    chunk = response.read(8192)
+                    if not chunk:
+                        break
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+
+        submissions_zip_path = zip_path
+
+    # Keep zip file open throughout processing
+    with zipfile.ZipFile(submissions_zip_path, 'r') as zip_file:
+        # Get all CIK filenames
+        all_filenames = [f for f in zip_file.namelist() if f.startswith('CIK')]
+
+        print(f"Processing {len(all_filenames)} files with {max_workers} workers...")
+
+        # Create batches of filenames
+        filename_batches = []
+        for i in range(0, len(all_filenames), batch_size):
+            batch = all_filenames[i:i + batch_size]
+            filename_batches.append(batch)
+
+        # Setup for threading
+        write_lock = threading.Lock()
+        total_filings = 0
+        is_first_write = True
+
+        # Process batches with thread pool
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all batch jobs
+            future_to_batch = {
+                executor.submit(process_file_batch, zip_file, batch): i
+                for i, batch in enumerate(filename_batches)
+            }
+
+            # Process results with progress bar
+            with tqdm(total=len(filename_batches), desc="Processing batches", unit="batch") as pbar:
+                for future in future_to_batch:
+                    try:
+                        batch_filings = future.result()
+
+                        if batch_filings:  # Only write if we have data
+                            write_csv_chunk(output_path, batch_filings, is_first_write, write_lock)
+                            is_first_write = False
+                            total_filings += len(batch_filings)
+
+                        pbar.update(1)
+                        pbar.set_postfix({
+                            'filings': total_filings,
+                            'files': len(filename_batches[future_to_batch[future]])
+                        })
+
+                    except Exception as e:
+                        print(f"Error processing batch: {e}")
+                        pbar.update(1)
+
+    print(f"Complete! Processed {total_filings} total filings")
+    print(f"Data saved to {output_path}")
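Usage is a single call; when `submissions_zip_path` is omitted, the function first downloads the full EDGAR bulk submissions archive (a large file) into a temp directory. A sketch:

    from datamule.utils.construct_submissions_data import construct_submissions_data

    # Writes one accessionNumber/filingDate/form/cik row per filing to CSV.
    construct_submissions_data(
        output_path='submissions_index.csv',
        submissions_zip_path='submissions.zip',  # omit to download from the SEC
        max_workers=4,
        batch_size=100,
    )

Note that the SEC expects automated clients to declare a User-Agent with real contact details, so the module's hardcoded placeholder ('John Smith johnsmith@gmail.com') would normally be replaced with your own.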
{datamule-1.5.9.dist-info → datamule-1.6.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.9
+Version: 1.6.1
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -15,6 +15,7 @@ Requires-Dist: selectolax
 Requires-Dist: pytz
 Requires-Dist: zstandard
 Requires-Dist: doc2dict
+Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
{datamule-1.5.9.dist-info → datamule-1.6.1.dist-info}/RECORD
CHANGED

@@ -5,12 +5,12 @@ datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
 datamule/portfolio.py,sha256=eF1eDSwIg-CI8ZmZAHRjCGU0UhuPN4ijxPB0YDT4s2o,8023
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=…
+datamule/submission.py,sha256=vAiYNas1YrWgm4Grw24peJbfSUVERySEko1zmdtG49s,13033
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=…
+datamule/document/document.py,sha256=YGo-Iz_sBXekUeKEAoNJV0BiLDtSOgD9OXFo2FocYq8,14439
 datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
 datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
 datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,6 +37,9 @@ datamule/document/mappings/ta.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS…
 datamule/document/mappings/thirteenfhr.py,sha256=XpYRIMPZnGLfEE4TqBI0BPXbyuq0xf3hut1fePOF6kU,4250
 datamule/document/mappings/twentyfivense.py,sha256=lKyj0ZBhkHX9gQJMTUPrQlxYFg3k-aBnWqtoS5bujZM,905
 datamule/document/mappings/twentyfourf2nt.py,sha256=Q7RPT3JgJHjYdjMuaSyAxclt6QPT_LgCQloxp-ByDuI,4118
+datamule/document/mappings_new/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/mappings_new/mappings.py,sha256=sP94GK3-klMCTD6XFajAP9KxJ7Wq5YMMaXcHx1rQEKA,281
+datamule/document/mappings_new/ownership.py,sha256=GVtyROefvEC_X5l6kayvZv57-kHxj8bHckAru8JtFOQ,10656
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/html_mapping_dicts.py,sha256=G2PWB__FNg4VH9iFJFkflM0u-qOEtk67IWtGoqesb0k,5388
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
@@ -48,7 +51,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs…
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
-datamule/sec/submissions/monitor.py,sha256=…
+datamule/sec/submissions/monitor.py,sha256=1JUMRYsTqtd31hX3UrUA_aXFUmZN6n-V7h0i1gavNOs,11395
 datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
 datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -60,7 +63,9 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu…
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
 datamule/seclibrary/downloader.py,sha256=ylv69VF22IVfrdeCkiGr5mVa2GKrPC9zFiDJU1fiBu8,17262
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule…
-datamule…
-datamule-1.…
-datamule-1.…
+datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/utils/construct_submissions_data.py,sha256=Jn37Ra2_nCIalATCjP_484eUiFP_YeglX_uNdK4Qfu8,5883
+datamule-1.6.1.dist-info/METADATA,sha256=0SEtRwvbaGgU-x_D8u3n0MUPYLssODtQf4GhQrGfl7s,524
+datamule-1.6.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.6.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.6.1.dist-info/RECORD,,
{datamule-1.5.9.dist-info → datamule-1.6.1.dist-info}/WHEEL
File without changes

{datamule-1.5.9.dist-info → datamule-1.6.1.dist-info}/top_level.txt
File without changes