datamule 1.5.9__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/document/document.py CHANGED
@@ -12,6 +12,7 @@ from .processing import process_tabular_data
  from pathlib import Path
  import webbrowser
  from secsgml.utils import bytes_to_str
+ from secxbrl import parse_inline_xbrl

  class Document:
      def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -33,6 +34,7 @@ class Document:
          self.extension = extension
          # this will be filled by parsed
          self.data = None
+         self.xbrl = None

      #_load_text_content
      def _preprocess_txt_content(self):
@@ -101,12 +103,23 @@ class Document:
          if self.extension in ['.htm', '.html', '.txt','.xml']:
              return bool(re.search(pattern, self.content))
          return False
+
+     def parse_xbrl(self,type='inline'):
+         if self.xbrl:
+             return
+         if type =='inline':
+             if self.extension not in ['.htm','.html']:
+                 return
+
+             self.xbrl = parse_inline_xbrl(self.content)
+         else:
+             raise ValueError("Only inline has been implemented so far.")

      # Note: this method will be heavily modified in the future
      def parse(self):
          # check if we have already parsed the content
          if self.data:
-             return self.data
+             return

          mapping_dict = None
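For orientation, here is a minimal usage sketch of the new `parse_xbrl` hook. The import path and the sample constructor values are assumptions for illustration; the constructor signature and the inline-only behaviour come from the hunk above.

```python
# Hedged sketch: exercising Document.parse_xbrl on a locally saved filing.
# The import path and all sample values below are assumptions, not documented API.
from datamule.document.document import Document

with open("primary_doc.htm", "rb") as f:   # any inline-XBRL filing saved locally
    html = f.read()

doc = Document(
    type="10-K",                           # placeholder document type
    content=html,
    extension=".htm",
    accession="0000000000-25-000000",      # placeholder accession number
    filing_date="2025-01-01",              # placeholder filing date
)

doc.parse_xbrl()   # fills doc.xbrl via secxbrl.parse_inline_xbrl; repeat calls are no-ops
print(doc.xbrl)    # stays None for non-.htm/.html documents; type other than 'inline' raises ValueError
```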
 
File without changes
@@ -0,0 +1,13 @@
+ import ownership
+
+
+ # key is document type
+ # note: this assumes XML format.
+ table_mappings = {
+     '3' : ownership.mappings,
+     '3/A' : ownership.mappings,
+     '4' : ownership.mappings,
+     '4/A' : ownership.mappings,
+     '5' : ownership.mappings,
+     '5/A' : ownership.mappings,
+ }
@@ -0,0 +1,174 @@
+
+
+
+ # Non-derivative transaction ownership mapping
+ ownership_non_derivative_transactions_dict = {
+     'securityTitle_value': 'securityTitle',
+     'securityTitle_footnote': 'securityTitleFootnote',
+     'transactionDate_value': 'transactionDate',
+     'transactionDate_footnote': 'transactionDateFootnote',
+     'deemedExecutionDate_value': 'deemedExecutionDate',
+     'deemedExecutionDate_footnote': 'deemedExecutionDateFootnote',
+     'transactionCoding_transactionFormType': 'transactionFormType',
+     'transactionCoding_transactionCode': 'transactionCode',
+     'transactionCoding_equitySwapInvolved': 'equitySwapInvolved',
+     'transactionCoding_footnote': 'transactionCodingFootnote',
+     'transactionAmounts_transactionShares_value': 'transactionShares',
+     'transactionAmounts_transactionShares_footnote': 'transactionSharesFootnote',
+     'transactionAmounts_transactionPricePerShare_value': 'transactionPricePerShare',
+     'transactionAmounts_transactionPricePerShare_footnote': 'transactionPricePerShareFootnote',
+     'transactionAmounts_transactionAcquiredDisposedCode_value': 'transactionAcquiredDisposedCode',
+     'transactionAmounts_transactionAcquiredDisposedCode_footnote': 'transactionAcquiredDisposedCodeFootnote',
+     'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
+     'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
+     'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
+     'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
+     'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
+     'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
+     'transactionTimeliness_value': 'transactionTimeliness',
+     'transactionTimeliness_footnote': 'transactionTimelinessFootnote',
+     'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
+     'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote'
+ }
+
+ # Derivative transaction ownership mapping
+ derivative_transaction_ownership_dict = {
+     'securityTitle_value': 'securityTitle',
+     'securityTitle_footnote': 'securityTitleFootnote',
+     'conversionOrExercisePrice_value': 'conversionOrExercisePrice',
+     'conversionOrExercisePrice_footnote': 'conversionOrExercisePriceFootnote',
+     'transactionDate_value': 'transactionDate',
+     'transactionDate_footnote': 'transactionDateFootnote',
+     'deemedExecutionDate_value': 'deemedExecutionDate',
+     'deemedExecutionDate_footnote': 'deemedExecutionDateFootnote',
+     'transactionCoding_transactionFormType': 'transactionFormType',
+     'transactionCoding_transactionCode': 'transactionCode',
+     'transactionCoding_equitySwapInvolved': 'equitySwapInvolved',
+     'transactionCoding_footnote': 'transactionCodingFootnote',
+     'transactionAmounts_transactionShares_value': 'transactionShares',
+     'transactionAmounts_transactionShares_footnote': 'transactionSharesFootnote',
+     'transactionAmounts_transactionPricePerShare_value': 'transactionPricePerShare',
+     'transactionAmounts_transactionPricePerShare_footnote': 'transactionPricePerShareFootnote',
+     'transactionAmounts_transactionAcquiredDisposedCode_value': 'transactionAcquiredDisposedCode',
+     'transactionAmounts_transactionTotalValue_value': 'transactionTotalValue',
+     'transactionAmounts_transactionTotalValue_footnote': 'transactionTotalValueFootnote',
+     'exerciseDate_value': 'exerciseDate',
+     'exerciseDate_footnote': 'exerciseDateFootnote',
+     'expirationDate_value': 'expirationDate',
+     'expirationDate_footnote': 'expirationDateFootnote',
+     'underlyingSecurity_underlyingSecurityTitle_value': 'underlyingSecurityTitle',
+     'underlyingSecurity_underlyingSecurityTitle_footnote': 'underlyingSecurityTitleFootnote',
+     'underlyingSecurity_underlyingSecurityShares_value': 'underlyingSecurityShares',
+     'underlyingSecurity_underlyingSecurityShares_footnote': 'underlyingSecuritySharesFootnote',
+     'underlyingSecurity_underlyingSecurityValue_value': 'underlyingSecurityValue',
+     'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
+     'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
+     'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
+     'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
+     'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
+     'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
+     'transactionTimeliness_value': 'transactionTimeliness',
+     'transactionTimeliness_footnote': 'transactionTimelinessFootnote',
+     'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
+     'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote',
+     'transactionAmounts_transactionAcquiredDisposedCode_footnote': 'transactionAcquiredDisposedCodeFootnote',
+     'underlyingSecurity_underlyingSecurityValue_footnote': 'underlyingSecurityValueFootnote'
+ }
+
+ # Non-derivative holding ownership mapping
+ non_derivative_holding_ownership_dict = {
+     'securityTitle_value': 'securityTitle',
+     'securityTitle_footnote': 'securityTitleFootnote',
+     'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
+     'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
+     'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
+     'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
+     'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
+     'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
+     'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
+     'transactionCoding_footnote': 'transactionCodingFootnote',
+     'transactionCoding_transactionFormType': 'transactionFormType',
+     'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote'
+ }
+
+ # Derivative holding ownership mapping
+ derivative_holding_ownership_dict = {
+     'securityTitle_value': 'securityTitle',
+     'securityTitle_footnote': 'securityTitleFootnote',
+     'conversionOrExercisePrice_value': 'conversionOrExercisePrice',
+     'conversionOrExercisePrice_footnote': 'conversionOrExercisePriceFootnote',
+     'exerciseDate_value': 'exerciseDate',
+     'exerciseDate_footnote': 'exerciseDateFootnote',
+     'expirationDate_value': 'expirationDate',
+     'expirationDate_footnote': 'expirationDateFootnote',
+     'underlyingSecurity_underlyingSecurityTitle_value': 'underlyingSecurityTitle',
+     'underlyingSecurity_underlyingSecurityTitle_footnote': 'underlyingSecurityTitleFootnote',
+     'underlyingSecurity_underlyingSecurityShares_value': 'underlyingSecurityShares',
+     'underlyingSecurity_underlyingSecurityShares_footnote': 'underlyingSecuritySharesFootnote',
+     'underlyingSecurity_underlyingSecurityValue_value': 'underlyingSecurityValue',
+     'underlyingSecurity_underlyingSecurityValue_footnote': 'underlyingSecurityValueFootnote',
+     'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
+     'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
+     'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
+     'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
+     'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
+     'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
+     'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
+     'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote',
+     'transactionCoding_transactionFormType': 'transactionFormType',
+     'transactionCoding_footnote': 'transactionCodingFootnote'
+ }
+
+ # Reporting owner ownership mapping
+ reporting_owner_ownership_dict = {
+     'reportingOwnerAddress_rptOwnerCity': 'rptOwnerCity',
+     'reportingOwnerAddress_rptOwnerState': 'rptOwnerState',
+     'reportingOwnerAddress_rptOwnerStateDescription': 'rptOwnerStateDescription',
+     'reportingOwnerAddress_rptOwnerStreet1': 'rptOwnerStreet1',
+     'reportingOwnerAddress_rptOwnerStreet2': 'rptOwnerStreet2',
+     'reportingOwnerAddress_rptOwnerZipCode': 'rptOwnerZipCode',
+     'reportingOwnerId_rptOwnerCik': 'rptOwnerCik',
+     'reportingOwnerId_rptOwnerName': 'rptOwnerName',
+     'reportingOwnerRelationship_isDirector': 'rptOwnerIsDirector',
+     'reportingOwnerRelationship_isOfficer': 'rptOwnerIsOfficer',
+     'reportingOwnerRelationship_isTenPercentOwner': 'rptOwnerIsTenPercentOwner',
+     'reportingOwnerRelationship_isOther': 'rptOwnerIsOther',
+     'reportingOwnerRelationship_officerTitle': 'rptOwnerOfficerTitle',
+     'reportingOwnerRelationship_otherText': 'rptOwnerOtherText'
+ }
+
+ # Metadata ownership mapping
+ metadata_ownership_dict = {
+     'periodOfReport': 'periodOfReport',
+     'issuer_issuerCik': 'issuerCik',
+     'issuer_issuerName': 'issuerName',
+     'issuer_issuerTradingSymbol': 'issuerTradingSymbol',
+     'documentType': 'documentType',
+     'remarks': 'remarks',
+     'documentDescription': 'documentDescription',
+     'footnotes': 'footnotes',
+     'notSubjectToSection16': 'notSubjectToSection16',
+     'form3HoldingsReported': 'form3HoldingsReported',
+     'form4TransactionsReported': 'form4TransactionsReported',
+     'noSecuritiesOwned': 'noSecuritiesOwned',
+     'aff10b5One': 'aff10b5One',
+     'dateOfOriginalSubmission': 'dateOfOriginalSubmission',
+     'schemaVersion': 'schemaVersion'
+ }
+
+ # Owner signature ownership mapping
+ owner_signature_ownership_dict = {
+     'signatureName': 'signatureName',
+     'signatureDate': 'signatureDate'
+ }
+
+
+ mappings = {
+     'ownership_non_derivative_transactions' : ownership_non_derivative_transactions_dict,
+     'ownership_derivative_transactions' : derivative_transaction_ownership_dict,
+     'ownership_non_derivative_holdings' : non_derivative_holding_ownership_dict,
+     'ownership_derivative_holdings' : derivative_holding_ownership_dict,
+     'ownership_reporting_owner' : reporting_owner_ownership_dict,
+     'ownership_metadata' : metadata_ownership_dict,
+     'ownership_owner_signature' : owner_signature_ownership_dict
+ }
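These dicts appear to rename the flattened XML paths produced by the parser into short column names. A hedged sketch of that renaming, using an invented Form 4 record and the non-derivative transaction mapping above (the import path is taken from the RECORD section at the end of this diff):

```python
# Sketch: renaming flattened ownership keys with the mapping above.
# The input record is invented; only keys present in the mapping are kept.
from datamule.document.mappings_new.ownership import ownership_non_derivative_transactions_dict

flattened = {
    'securityTitle_value': 'Common Stock',
    'transactionDate_value': '2025-01-02',
    'transactionAmounts_transactionShares_value': '1000',
}

renamed = {
    ownership_non_derivative_transactions_dict[key]: value
    for key, value in flattened.items()
    if key in ownership_non_derivative_transactions_dict
}
print(renamed)
# {'securityTitle': 'Common Stock', 'transactionDate': '2025-01-02', 'transactionShares': '1000'}
# table_mappings['4'] in the previous file would hand back this whole ownership.mappings bundle for a Form 4.
```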
datamule/sec/submissions/monitor.py CHANGED
@@ -9,16 +9,14 @@ from .eftsquery import EFTSQuery
  import aiohttp
  from zoneinfo import ZoneInfo

- async def poll_rss(limiter):
+ async def poll_rss(limiter, session):
      base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'

-     # Create a session specifically for this RSS polling operation
-     async with aiohttp.ClientSession(headers=headers) as session:
-         # Use the rate limiter before making the request
-         async with limiter:
-             # Make the HTTP request with the session
-             async with session.get(base_url) as response:
-                 content = await response.read()
+     # Use the rate limiter before making the request
+     async with limiter:
+         # Use the provided session instead of creating a new one
+         async with session.get(base_url) as response:
+             content = await response.read()

      # Process the content
      content_str = content.decode('utf-8')
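The signature change means the caller now owns the aiohttp session and passes it in alongside the rate limiter. A hedged sketch of the new calling convention (the module path is taken from the RECORD section below; the Semaphore stands in for PreciseRateLimiter, which the code above also uses via `async with`):

```python
# Hedged sketch of calling poll_rss with a caller-owned session.
import asyncio
import aiohttp

from datamule.sec.submissions.monitor import poll_rss  # module path assumed from the RECORD

headers = {'User-Agent': 'Jane Doe janedoe@example.com'}  # placeholder SEC-style User-Agent

async def main():
    limiter = asyncio.Semaphore(5)  # stand-in for PreciseRateLimiter (same async-with protocol)
    async with aiohttp.ClientSession(headers=headers) as session:
        results = await poll_rss(limiter, session)
        print(results)  # records later fed into Monitor._filter_new_accessions in the loop below

asyncio.run(main())
```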
@@ -70,12 +68,31 @@ class Monitor():
          self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
          self.efts_query = EFTSQuery(quiet=True)
          self.efts_query.limiter = self.ratelimiters['sec.gov']
+         self.session = None
+         self.session_created_at = 0
+         self.session_lifetime = 300 # 5 minutes in seconds

      def set_domain_rate_limit(self, domain, rate):
          self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
          if domain == 'sec.gov':
              self.efts_query.limiter = self.ratelimiters[domain]

+     async def _ensure_fresh_session(self):
+         """Ensure we have a fresh session, recreating if expired or missing"""
+         current_time = time.time()
+
+         # Check if we need a new session
+         if (self.session is None or
+             current_time - self.session_created_at > self.session_lifetime):
+
+             # Close old session if it exists
+             if self.session:
+                 await self.session.close()
+
+             # Create new session
+             self.session = aiohttp.ClientSession(headers=headers)
+             self.session_created_at = current_time
+
      async def _async_run_efts_query(self, **kwargs):
          """Async helper method to run EFTS query without creating a new event loop"""
          # Make sure to set quiet parameter if provided in kwargs
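The recycling logic above is simple enough to illustrate outside aiohttp. A standalone sketch of the same expiry check, with invented names, runnable anywhere:

```python
# Standalone sketch of the expiry check in _ensure_fresh_session (names invented).
import io
import time

class SessionRecycler:
    def __init__(self, factory, lifetime=300):
        self.factory = factory        # builds a new session-like object with .close()
        self.lifetime = lifetime      # seconds before the current object is considered stale
        self.session = None
        self.created_at = 0.0

    def get(self):
        # Recreate when missing or older than `lifetime`, mirroring the check above.
        if self.session is None or time.time() - self.created_at > self.lifetime:
            if self.session is not None:
                self.session.close()
            self.session = self.factory()
            self.created_at = time.time()
        return self.session

recycler = SessionRecycler(io.BytesIO, lifetime=300)
buf = recycler.get()            # same object comes back until 300 seconds have elapsed
assert recycler.get() is buf
```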
@@ -103,83 +120,106 @@ class Monitor():
          if polling_interval is None and validation_interval is None:
              raise ValueError("At least one of polling_interval or validation_interval must be specified")

-         # Backfill if start_date is provided
-         if start_date is not None:
-             today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
-             if not quiet:
-                 print(f"Backfilling from {start_date} to {today_date}")
-
-             hits = clean_efts_hits(await self._async_run_efts_query(
-                 filing_date=(start_date, today_date),
-                 quiet=quiet
-             ))
-
-             new_hits = self._filter_new_accessions(hits)
-             if not quiet:
-                 print(f"New submissions found: {len(new_hits)}")
-             if new_hits and data_callback:
-                 data_callback(new_hits)
-
-         # Initialize timing variables
-         current_time = time.time()
-         last_polling_time = current_time
-         last_validation_time = current_time
-
-         # Determine which operations to perform
-         do_polling = polling_interval is not None
-         do_validation = validation_interval is not None
+         # Ensure we have a fresh session
+         await self._ensure_fresh_session()

-         while True:
-             current_time = time.time()
-
-             # RSS polling (if enabled)
-             if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
-                 if not quiet:
-                     print(f"Polling RSS feed")
-                 results = await poll_rss(self.ratelimiters['sec.gov'])
-                 new_results = self._filter_new_accessions(results)
-                 if new_results:
-                     if not quiet:
-                         print(f"Found {len(new_results)} new submissions via RSS")
-                     if data_callback:
-                         data_callback(new_results)
-                 last_polling_time = current_time
-
-             # EFTS validation (if enabled)
-             if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
-                 # Get submissions from the last 24 hours for validation
+         try:
+             # Backfill if start_date is provided
+             if start_date is not None:
                  today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
                  if not quiet:
-                     print(f"Validating submissions from {today_date}")
+                     print(f"Backfilling from {start_date} to {today_date}")

                  hits = clean_efts_hits(await self._async_run_efts_query(
-                     filing_date=(today_date, today_date),
+                     filing_date=(start_date, today_date),
                      quiet=quiet
                  ))
-
+
                  new_hits = self._filter_new_accessions(hits)
-                 if new_hits:
-                     if not quiet:
-                         print(f"Found {len(new_hits)} new submissions via EFTS validation")
-                     if data_callback:
-                         data_callback(new_hits)
-                 last_validation_time = current_time
+                 if not quiet:
+                     print(f"New submissions found: {len(new_hits)}")
+                 if new_hits and data_callback:
+                     data_callback(new_hits)
+
+             # Initialize timing variables
+             current_time = time.time()
+             last_polling_time = current_time
+             last_validation_time = current_time

-             # Interval callback
-             if interval_callback:
-                 interval_callback()
-
-             # Calculate next wake-up time
-             next_times = []
-             if do_polling:
-                 next_times.append(last_polling_time + (polling_interval / 1000))
-             if do_validation:
-                 next_times.append(last_validation_time + (validation_interval / 1000))
+             # Determine which operations to perform
+             do_polling = polling_interval is not None
+             do_validation = validation_interval is not None

-             next_wake_time = min(next_times)
-             current_time = time.time()
-             time_to_sleep = max(0, next_wake_time - current_time)
-             await asyncio.sleep(time_to_sleep)
+             while True:
+                 current_time = time.time()
+
+                 # RSS polling (if enabled)
+                 if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
+                     if not quiet:
+                         print(f"Polling RSS feed")
+
+                     # Ensure session is fresh before polling
+                     await self._ensure_fresh_session()
+
+                     try:
+                         results = await poll_rss(self.ratelimiters['sec.gov'], self.session)
+                         new_results = self._filter_new_accessions(results)
+                         if new_results:
+                             if not quiet:
+                                 print(f"Found {len(new_results)} new submissions via RSS")
+                             if data_callback:
+                                 data_callback(new_results)
+                     except Exception as e:
+                         if not quiet:
+                             print(f"RSS polling error: {e}, will recreate session on next poll")
+                         # Force session recreation on next poll
+                         if self.session:
+                             await self.session.close()
+                         self.session = None
+
+                     last_polling_time = current_time
+
+                 # EFTS validation (if enabled)
+                 if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
+                     # Get submissions from the last 24 hours for validation
+                     today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
+                     if not quiet:
+                         print(f"Validating submissions from {today_date}")
+
+                     hits = clean_efts_hits(await self._async_run_efts_query(
+                         filing_date=(today_date, today_date),
+                         quiet=quiet
+                     ))
+
+                     new_hits = self._filter_new_accessions(hits)
+                     if new_hits:
+                         if not quiet:
+                             print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                         if data_callback:
+                             data_callback(new_hits)
+                     last_validation_time = current_time
+
+                 # Interval callback
+                 if interval_callback:
+                     interval_callback()
+
+                 # Calculate next wake-up time
+                 next_times = []
+                 if do_polling:
+                     next_times.append(last_polling_time + (polling_interval / 1000))
+                 if do_validation:
+                     next_times.append(last_validation_time + (validation_interval / 1000))
+
+                 next_wake_time = min(next_times)
+                 current_time = time.time()
+                 time_to_sleep = max(0, next_wake_time - current_time)
+                 await asyncio.sleep(time_to_sleep)
+
+         finally:
+             # Clean up the session when done
+             if self.session:
+                 await self.session.close()
+                 self.session = None

      def monitor_submissions(self, data_callback=None, interval_callback=None,
                              polling_interval=1000, quiet=True, start_date=None,
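Putting the pieces together, a hedged sketch of driving the monitor with a callback. The import path comes from the RECORD section below; the Monitor constructor arguments (if any) and the full monitor_submissions signature are not shown in this diff, so treat the call as illustrative. polling_interval is in milliseconds, as the `/1000` conversions above show.

```python
# Hedged sketch: wiring a callback into the monitor loop shown above.
# Keyword names beyond those visible in the truncated signature are assumptions.
from datamule.sec.submissions.monitor import Monitor

def on_new_filings(hits):
    for hit in hits:            # each hit is a newly observed submission record
        print(hit)

monitor = Monitor()             # assumes a no-argument constructor
monitor.monitor_submissions(
    data_callback=on_new_filings,
    polling_interval=1000,      # poll the RSS feed roughly once per second (milliseconds)
    quiet=False,
)
```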
datamule/submission.py CHANGED
@@ -251,7 +251,8 @@ class Submission:
      try:
          content = tar.extractfile(filename+'.zst').read()
      except:
-         raise ValueError("Something went wrong with tar")
+         # some of these issues are on SEC data end, will fix when I setup cloud.
+         raise ValueError(f"Something went wrong with tar: {self.path}")
      # Decompress if compressed
      if filename.endswith('.gz'):
          content = gzip.decompress(content)
File without changes
@@ -0,0 +1,150 @@
+ import zipfile
+ import json
+ import csv
+ import os
+ import tempfile
+ from concurrent.futures import ThreadPoolExecutor
+ import threading
+ from tqdm import tqdm
+ import urllib.request
+
+ headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
+
+ def process_file_batch(zip_file, filenames_batch):
+     """Process a batch of files from the zip archive"""
+     batch_filings = []
+
+     for filename in filenames_batch:
+         if not filename.startswith('CIK'):
+             continue
+
+         try:
+             # Extract CIK from filename
+             cik = int(filename.split('.')[0].split('-')[0][3:])
+
+             # Read raw bytes and parse JSON
+             with zip_file.open(filename) as file:
+                 raw_data = file.read()
+                 submissions_dct = json.loads(raw_data)
+
+             # Handle different file types
+             if 'submissions' in filename:
+                 filings_data = submissions_dct
+             else:
+                 filings_data = submissions_dct['filings']['recent']
+
+             # Extract required data
+             accession_numbers = filings_data['accessionNumber']
+             filing_dates = filings_data['filingDate']
+             forms = filings_data['form']
+
+             # Create filing records for this file
+             for j in range(len(accession_numbers)):
+                 filing_record = {
+                     'accessionNumber': accession_numbers[j],
+                     'filingDate': filing_dates[j],
+                     'form': forms[j],
+                     'cik': cik
+                 }
+                 batch_filings.append(filing_record)
+
+         except Exception as e:
+             print(f"Error processing {filename}: {e}")
+             continue
+
+     return batch_filings
+
+ def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
+     """Thread-safe CSV writing with lock"""
+     with write_lock:
+         if is_first_write:
+             with open(output_path, 'w', newline='') as csvfile:
+                 fieldnames = ['accessionNumber', 'filingDate', 'form', 'cik']
+                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                 writer.writeheader()
+                 writer.writerows(filings_data)
+         else:
+             with open(output_path, 'a', newline='') as csvfile:
+                 fieldnames = ['accessionNumber', 'filingDate', 'form', 'cik']
+                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                 writer.writerows(filings_data)
+
+ def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4, batch_size=100):
+     """Creates a list of dicts of every accession number, with filing date, submission type, and ciks"""
+
+     if submissions_zip_path is None:
+         url = "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip"
+
+         temp_dir = tempfile.mkdtemp()
+         zip_path = os.path.join(temp_dir, 'submissions.zip')
+
+         req = urllib.request.Request(url, headers=headers)
+
+         with urllib.request.urlopen(req) as response:
+             total_size = int(response.headers.get('Content-Length', 0))
+
+             with open(zip_path, 'wb') as f, tqdm(
+                 desc="Downloading",
+                 total=total_size,
+                 unit='B',
+                 unit_scale=True,
+                 unit_divisor=1024,
+             ) as pbar:
+                 while True:
+                     chunk = response.read(8192)
+                     if not chunk:
+                         break
+                     f.write(chunk)
+                     pbar.update(len(chunk))
+
+         submissions_zip_path = zip_path
+
+     # Keep zip file open throughout processing
+     with zipfile.ZipFile(submissions_zip_path, 'r') as zip_file:
+         # Get all CIK filenames
+         all_filenames = [f for f in zip_file.namelist() if f.startswith('CIK')]
+
+         print(f"Processing {len(all_filenames)} files with {max_workers} workers...")
+
+         # Create batches of filenames
+         filename_batches = []
+         for i in range(0, len(all_filenames), batch_size):
+             batch = all_filenames[i:i + batch_size]
+             filename_batches.append(batch)
+
+         # Setup for threading
+         write_lock = threading.Lock()
+         total_filings = 0
+         is_first_write = True
+
+         # Process batches with thread pool
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             # Submit all batch jobs
+             future_to_batch = {
+                 executor.submit(process_file_batch, zip_file, batch): i
+                 for i, batch in enumerate(filename_batches)
+             }
+
+             # Process results with progress bar
+             with tqdm(total=len(filename_batches), desc="Processing batches", unit="batch") as pbar:
+                 for future in future_to_batch:
+                     try:
+                         batch_filings = future.result()
+
+                         if batch_filings: # Only write if we have data
+                             write_csv_chunk(output_path, batch_filings, is_first_write, write_lock)
+                             is_first_write = False
+                             total_filings += len(batch_filings)
+
+                         pbar.update(1)
+                         pbar.set_postfix({
+                             'filings': total_filings,
+                             'files': len(filename_batches[future_to_batch[future]])
+                         })
+
+                     except Exception as e:
+                         print(f"Error processing batch: {e}")
+                         pbar.update(1)
+
+     print(f"Complete! Processed {total_filings} total filings")
+     print(f"Data saved to {output_path}")
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.5.9
+ Version: 1.6.1
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -15,6 +15,7 @@ Requires-Dist: selectolax
  Requires-Dist: pytz
  Requires-Dist: zstandard
  Requires-Dist: doc2dict
+ Requires-Dist: secxbrl
  Requires-Dist: secsgml
  Requires-Dist: websocket-client

@@ -5,12 +5,12 @@ datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
  datamule/portfolio.py,sha256=eF1eDSwIg-CI8ZmZAHRjCGU0UhuPN4ijxPB0YDT4s2o,8023
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
- datamule/submission.py,sha256=6JIi-ayLL-jENVj6Q4IhmrYlAreJI7xBAHP_NYaDB6k,12918
+ datamule/submission.py,sha256=vAiYNas1YrWgm4Grw24peJbfSUVERySEko1zmdtG49s,13033
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/document/document.py,sha256=04Rivdphq0D1HEGIBjtl1LelJr-IyQU1qCMi8yNJajw,14038
+ datamule/document/document.py,sha256=YGo-Iz_sBXekUeKEAoNJV0BiLDtSOgD9OXFo2FocYq8,14439
  datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
  datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
  datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,6 +37,9 @@ datamule/document/mappings/ta.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
  datamule/document/mappings/thirteenfhr.py,sha256=XpYRIMPZnGLfEE4TqBI0BPXbyuq0xf3hut1fePOF6kU,4250
  datamule/document/mappings/twentyfivense.py,sha256=lKyj0ZBhkHX9gQJMTUPrQlxYFg3k-aBnWqtoS5bujZM,905
  datamule/document/mappings/twentyfourf2nt.py,sha256=Q7RPT3JgJHjYdjMuaSyAxclt6QPT_LgCQloxp-ByDuI,4118
+ datamule/document/mappings_new/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamule/document/mappings_new/mappings.py,sha256=sP94GK3-klMCTD6XFajAP9KxJ7Wq5YMMaXcHx1rQEKA,281
+ datamule/document/mappings_new/ownership.py,sha256=GVtyROefvEC_X5l6kayvZv57-kHxj8bHckAru8JtFOQ,10656
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/mapping_dicts/html_mapping_dicts.py,sha256=G2PWB__FNg4VH9iFJFkflM0u-qOEtk67IWtGoqesb0k,5388
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
@@ -48,7 +51,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
  datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
- datamule/sec/submissions/monitor.py,sha256=ll0nfHzG8FI3bA8zVFrfsfZGnbt5qAD4rRZ4LG2SORY,9567
+ datamule/sec/submissions/monitor.py,sha256=1JUMRYsTqtd31hX3UrUA_aXFUmZN6n-V7h0i1gavNOs,11395
  datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
  datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
  datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -60,7 +63,9 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
  datamule/seclibrary/downloader.py,sha256=ylv69VF22IVfrdeCkiGr5mVa2GKrPC9zFiDJU1fiBu8,17262
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
- datamule-1.5.9.dist-info/METADATA,sha256=DkoMbTIImVjWfEkqwfe7BBqCpkvBC8CFRRF5v7PKyco,501
- datamule-1.5.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- datamule-1.5.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
- datamule-1.5.9.dist-info/RECORD,,
+ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamule/utils/construct_submissions_data.py,sha256=Jn37Ra2_nCIalATCjP_484eUiFP_YeglX_uNdK4Qfu8,5883
+ datamule-1.6.1.dist-info/METADATA,sha256=0SEtRwvbaGgU-x_D8u3n0MUPYLssODtQf4GhQrGfl7s,524
+ datamule-1.6.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ datamule-1.6.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-1.6.1.dist-info/RECORD,,