datamule 1.2.4.tar.gz → 1.2.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {datamule-1.2.4 → datamule-1.2.6}/PKG-INFO +1 -2
  2. {datamule-1.2.4 → datamule-1.2.6}/datamule/__init__.py +1 -0
  3. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/document.py +26 -13
  4. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/information_table.py +1 -0
  5. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ownership.py +1 -1
  6. datamule-1.2.6/datamule/document/mappings/proxy_voting_record.py +17 -0
  7. datamule-1.2.6/datamule/document/mappings/submission_metadata.py +9 -0
  8. datamule-1.2.6/datamule/document/mappings/thirteenfhr.py +72 -0
  9. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/twentyfivense.py +1 -0
  10. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/processing.py +35 -5
  11. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/table.py +6 -1
  12. {datamule-1.2.4 → datamule-1.2.6}/datamule/helper.py +10 -1
  13. {datamule-1.2.4 → datamule-1.2.6}/datamule/index.py +8 -10
  14. {datamule-1.2.4 → datamule-1.2.6}/datamule/portfolio.py +17 -16
  15. datamule-1.2.6/datamule/sec/submissions/monitor.py +183 -0
  16. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/submissions/textsearch.py +0 -4
  17. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/xbrl/streamcompanyfacts.py +1 -1
  18. {datamule-1.2.4 → datamule-1.2.6}/datamule/seclibrary/downloader.py +2 -2
  19. {datamule-1.2.4 → datamule-1.2.6}/datamule/submission.py +80 -14
  20. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/PKG-INFO +1 -2
  21. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/SOURCES.txt +1 -2
  22. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/requires.txt +0 -1
  23. {datamule-1.2.4 → datamule-1.2.6}/setup.py +1 -2
  24. datamule-1.2.4/datamule/document/mappings/proxy_voting_record.py +0 -1
  25. datamule-1.2.4/datamule/document/mappings/thirteenfhr.py +0 -5
  26. datamule-1.2.4/datamule/sec/rss/monitor.py +0 -416
  27. datamule-1.2.4/datamule/sec/submissions/monitor.py +0 -130
  28. datamule-1.2.4/datamule/seclibrary/__init__.py +0 -0
  29. {datamule-1.2.4 → datamule-1.2.6}/datamule/config.py +0 -0
  30. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/__init__.py +0 -0
  31. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/__init__.py +0 -0
  32. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/atsn.py +0 -0
  33. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/cfportal.py +0 -0
  34. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ex99a_sdr.py +0 -0
  35. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ex99c_sdr.py +0 -0
  36. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ex99g_sdr.py +0 -0
  37. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ex99i_sdr.py +0 -0
  38. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/nmfp.py +0 -0
  39. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/npx.py +0 -0
  40. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/onefourtyfour.py +0 -0
  41. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/sbs.py +0 -0
  42. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/sbsef.py +0 -0
  43. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/schedule13.py +0 -0
  44. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/sdr.py +0 -0
  45. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/ta.py +0 -0
  46. {datamule-1.2.4 → datamule-1.2.6}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  47. {datamule-1.2.4 → datamule-1.2.6}/datamule/mapping_dicts/__init__.py +0 -0
  48. {datamule-1.2.4 → datamule-1.2.6}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  49. {datamule-1.2.4 → datamule-1.2.6}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  50. {datamule-1.2.4 → datamule-1.2.6}/datamule/package_updater.py +0 -0
  51. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/__init__.py +0 -0
  52. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/infrastructure/__init__.py +0 -0
  53. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  54. {datamule-1.2.4/datamule/sec/rss → datamule-1.2.6/datamule/sec/submissions}/__init__.py +0 -0
  55. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/submissions/downloader.py +0 -0
  56. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/submissions/eftsquery.py +0 -0
  57. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/submissions/streamer.py +0 -0
  58. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/utils.py +0 -0
  59. {datamule-1.2.4/datamule/sec/submissions → datamule-1.2.6/datamule/sec/xbrl}/__init__.py +0 -0
  60. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  61. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  62. {datamule-1.2.4 → datamule-1.2.6}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  63. {datamule-1.2.4/datamule/sec/xbrl → datamule-1.2.6/datamule/seclibrary}/__init__.py +0 -0
  64. {datamule-1.2.4 → datamule-1.2.6}/datamule/seclibrary/bq.py +0 -0
  65. {datamule-1.2.4 → datamule-1.2.6}/datamule/seclibrary/query.py +0 -0
  66. {datamule-1.2.4 → datamule-1.2.6}/datamule/sheet.py +0 -0
  67. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/dependency_links.txt +0 -0
  68. {datamule-1.2.4 → datamule-1.2.6}/datamule.egg-info/top_level.txt +0 -0
  69. {datamule-1.2.4 → datamule-1.2.6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.2.4
+Version: 1.2.6
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -10,7 +10,6 @@ Requires-Dist: tqdm
 Requires-Dist: requests
 Requires-Dist: nest_asyncio
 Requires-Dist: aiofiles
-Requires-Dist: polars
 Requires-Dist: setuptools
 Requires-Dist: selectolax
 Requires-Dist: pytz
@@ -8,6 +8,7 @@ from .index import Index
 from .package_updater import PackageUpdater
 
 
+
 # Keep the notebook environment setup
 def _is_notebook_env():
     """Check if the code is running in a Jupyter or Colab environment."""
@@ -118,10 +118,11 @@ class Document:
         # will deprecate this when we add html2dict
         elif self.extension in ['.htm', '.html', '.txt']:
 
-            if self.type == '10-K':
-                mapping_dict = dict_10k
-            elif self.type == '10-Q':
+
+            if self.type == '10-Q':
                 mapping_dict = dict_10q
+            elif self.type == '10-K':
+                mapping_dict = dict_10k
             elif self.type == '8-K':
                 mapping_dict = dict_8k
             elif self.type == 'SC 13D':
@@ -141,27 +142,39 @@
             json.dump(self.data, f, indent=2)
 
     def to_tabular(self):
-        if self.extension != '.xml':
+        if self.type == 'submission_metadata':
+            return process_tabular_data(self)
+        elif self.extension != '.xml':
             return []
-        self.parse()
-        return process_tabular_data(self)
+        else:
+            self.parse()
+            return process_tabular_data(self)
 
 
-    def write_csv(self, output_folder, accession_number=None):
+    def write_csv(self, output_folder):
+        output_folder = Path(output_folder)
+        output_folder.mkdir(exist_ok=True)
 
-        tables = self.to_tabular(accession_number)
+        tables = self.to_tabular()
 
         if not tables:
             return
 
         for table in tables:
             fieldnames = table.columns
-            output_filename = Path(output_folder) / f"{table.type}.csv"
+            output_filename = output_folder / f"{table.type}.csv"
+
+            # Check if the file already exists
+            if output_filename.exists():
 
-            with open(output_filename, 'w', newline='') as csvfile:
-                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
-                writer.writeheader()
-                writer.writerows(table.data)
+                with open(output_filename, 'a', newline='') as csvfile:
+                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                    writer.writerows(table.data)
+            else:
+                with open(output_filename, 'w', newline='') as csvfile:
+                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                    writer.writeheader()
+                    writer.writerows(table.data)
 
 
     def _document_to_section_text(self, document_data, parent_key=''):
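
With 1.2.6, write_csv coerces output_folder to a Path, creates it if needed, and appends to an existing <table.type>.csv rather than overwriting it, so tables from many documents can accumulate in one folder. A usage sketch (assumes filings were already downloaded into 'filings', that Portfolio is exported from the package root, and that Portfolio.document_type is the iteration helper shown in the portfolio.py hunk further down):

    from datamule import Portfolio

    portfolio = Portfolio('filings')  # existing folder of downloaded filings
    for document in portfolio.document_type('INFORMATION TABLE'):
        document.write_csv('tables')  # repeat calls append rows to tables/<table.type>.csv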
@@ -1,3 +1,4 @@
+# Ready for mass testing
 
 # Information Table (13F-HR Securities) mapping
 information_table_dict = {
@@ -1,4 +1,4 @@
-# Mapping dictionaries for SEC filing table types based on actual field occurrences
+# Ready for mass testing
 
 # Non-derivative transaction ownership mapping
 non_derivative_transaction_ownership_dict = {
@@ -0,0 +1,17 @@
+proxy_voting_record_dict = {
+    'meetingDate': 'meetingDate',
+    'accession': 'accessionNumber',
+    'vote_voteRecord_managementRecommendation': 'managementRecommendation',
+    'sharesVoted': 'sharesVoted',  # Top-level sharesVoted
+    'vote_voteRecord_howVoted': 'howVoted',
+    'sharesOnLoan': 'sharesOnLoan',
+    'cusip': 'cusip',
+    'issuerName': 'issuerName',
+    'voteCategories_voteCategory_categoryType': 'categoryType',
+    'voteDescription': 'voteDescription',
+    'voteManager_otherManagers_otherManager': 'otherManager',
+    'vote_voteRecord_sharesVoted': 'recordSharesVoted',  # To distinguish from top-level sharesVoted
+    'isin': 'isin',
+    'voteSource': 'voteSource',
+    'voteSeries': 'voteSeries'
+}
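
These mapping dicts translate underscore-flattened XML paths into the column names used in the tabular output. A minimal standalone sketch of that renaming (rename_columns is a hypothetical illustration, not datamule API):

    proxy_mapping = {
        'vote_voteRecord_howVoted': 'howVoted',
        'vote_voteRecord_sharesVoted': 'recordSharesVoted',
    }

    def rename_columns(row, mapping):
        # Rename keys the mapping knows about; pass unmapped keys through unchanged.
        return {mapping.get(key, key): value for key, value in row.items()}

    row = {'vote_voteRecord_howVoted': 'FOR', 'vote_voteRecord_sharesVoted': '1200'}
    print(rename_columns(row, proxy_mapping))
    # {'howVoted': 'FOR', 'recordSharesVoted': '1200'}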
@@ -0,0 +1,9 @@
+# Note: submission_metadata is my designation, not the SEC's, for the header of the Submission tag
+
+document_submission_metadata_dict = {
+    'accession': 'accession',
+    'type': 'type',
+    'sequence': 'sequence',
+    'filename': 'filename',
+    'description': 'description'
+}
@@ -0,0 +1,72 @@
+# Ready for mass testing
+
+# 13F-HR (Institutional Investment Manager Holdings) mapping
+thirteenfhr_dict = {
+    # Cover Page Mapping
+    'formData_coverPage_reportCalendarOrQuarter': 'reportCalendarOrQuarter',
+    'formData_coverPage_filingManager_name': 'filingManagerName',
+    'formData_coverPage_filingManager_address_street1': 'filingManagerStreet1',
+    'formData_coverPage_filingManager_address_street2': 'filingManagerStreet2',
+    'formData_coverPage_filingManager_address_city': 'filingManagerCity',
+    'formData_coverPage_filingManager_address_stateOrCountry': 'filingManagerStateOrCountry',
+    'formData_coverPage_filingManager_address_zipCode': 'filingManagerZipCode',
+    'formData_coverPage_crdNumber': 'crdNumber',
+    'formData_coverPage_secFileNumber': 'secFileNumber',
+    'formData_coverPage_form13FFileNumber': 'form13FFileNumber',
+    'formData_coverPage_reportType': 'reportType',
+    'formData_coverPage_isAmendment': 'isAmendment',
+    'formData_coverPage_amendmentNo': 'amendmentNo',
+    'formData_coverPage_amendmentInfo_amendmentType': 'amendmentType',
+    'formData_coverPage_amendmentInfo_confDeniedExpired': 'confDeniedExpired',
+    'formData_coverPage_additionalInformation': 'additionalInformation',
+    'formData_coverPage_provideInfoForInstruction5': 'provideInfoForInstruction5',
+
+    # Other Managers Info Mapping
+    'formData_coverPage_otherManagersInfo_otherManager': 'otherManager',
+    'formData_coverPage_otherManagersInfo_otherManager_cik': 'otherManagerCik',
+    'formData_coverPage_otherManagersInfo_otherManager_name': 'otherManagerName',
+    'formData_coverPage_otherManagersInfo_otherManager_crdNumber': 'otherManagerCrdNumber',
+    'formData_coverPage_otherManagersInfo_otherManager_secFileNumber': 'otherManagerSecFileNumber',
+    'formData_coverPage_otherManagersInfo_otherManager_form13FFileNumber': 'otherManagerForm13FFileNumber',
+
+    # Summary Page Mapping
+    'formData_summaryPage_isConfidentialOmitted': 'isConfidentialOmitted',
+    'formData_summaryPage_otherIncludedManagersCount': 'otherIncludedManagersCount',
+    'formData_summaryPage_tableEntryTotal': 'tableEntryTotal',
+    'formData_summaryPage_tableValueTotal': 'tableValueTotal',
+
+    # Other Managers 2 Info Mapping
+    'formData_summaryPage_otherManagers2Info_otherManager2': 'otherManager2',
+    'formData_summaryPage_otherManagers2Info_otherManager2_sequenceNumber': 'otherManager2SequenceNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_cik': 'otherManager2Cik',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_name': 'otherManager2Name',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_crdNumber': 'otherManager2CrdNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_secFileNumber': 'otherManager2SecFileNumber',
+    'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_form13FFileNumber': 'otherManager2Form13FFileNumber',
+
+    # Signature Block Mapping
+    'formData_signatureBlock_name': 'signatureName',
+    'formData_signatureBlock_title': 'signatureTitle',
+    'formData_signatureBlock_phone': 'signaturePhone',
+    'formData_signatureBlock_signature': 'signature',
+    'formData_signatureBlock_city': 'signatureCity',
+    'formData_signatureBlock_stateOrCountry': 'signatureStateOrCountry',
+    'formData_signatureBlock_signatureDate': 'signatureDate',
+
+    # Header Data Mapping
+    'headerData_filerInfo_periodOfReport': 'periodOfReport',
+    'headerData_filerInfo_filer_fileNumber': 'filerFileNumber',
+    'headerData_filerInfo_filer_credentials_cik': 'filerCik',
+    'headerData_filerInfo_filer_credentials_ccc': 'filerCcc',
+    'headerData_filerInfo_flags_confirmingCopyFlag': 'confirmingCopyFlag',
+    'headerData_filerInfo_flags_returnCopyFlag': 'returnCopyFlag',
+    'headerData_filerInfo_flags_overrideInternetFlag': 'overrideInternetFlag',
+    'headerData_filerInfo_denovoRequest': 'denovoRequest',
+    'headerData_filerInfo_liveTestFlag': 'liveTestFlag',
+    'headerData_submissionType': 'submissionType',
+
+    # Schema and Metadata Mapping
+    'schemaLocation': 'schemaLocation',
+    'schemaVersion': 'schemaVersion',
+    'accession': 'accessionNumber'
+}
@@ -1,3 +1,4 @@
+# Ready for mass testing
 # 25-NSE mapping
 twentyfive_nse_dict = {
     'descriptionClassSecurity': 'securityDescription',
@@ -17,6 +17,12 @@ def process_tabular_data(self):
         tables = process_13fhr(self.data, self.accession)
     elif self.type in ["INFORMATION TABLE"]:
         tables = process_information_table(self.data, self.accession)
+    elif self.type in ["25-NSE", "25-NSE/A"]:
+        tables = process_25nse(self.data, self.accession)
+    # complete mark:
+    elif self.type in ["N-PX","N-PX/A"]:
+        tables = process_npx(self.data, self.accession)
+
     elif self.type in ["SBSEF","SBSEF/A","SBSEF-V","SBSEF-W"]:
         tables = process_sbsef(self.data, self.accession)
     elif self.type in ["SDR","SDR/A","SDR-W","SDR-A"]:
@@ -33,8 +39,7 @@ def process_tabular_data(self):
         tables = process_144(self.data, self.accession)
     elif self.type in ["24F-2NT", "24F-2NT/A"]:
         tables = process_24f2nt(self.data, self.accession)
-    elif self.type in ["25-NSE", "25-NSE/A"]:
-        tables = process_25nse(self.data, self.accession)
+
     elif self.type in ["ATS-N", "ATS-N/A"]:
         tables = process_ats(self.data, self.accession)
     # elif self.type in ["C","C-W","C-U","C-U-W","C/A","C/A-W",
@@ -53,8 +58,7 @@ def process_tabular_data(self):
     #     tables = process_nmfp(self.data, self.accession)
     # elif self.type in ["NPORT-P","NPORT-P/A"]:
     #     tables = process_nportp(self.data, self.accession)
-    elif self.type in ["N-PX","N-PX/A"]:
-        tables = process_npx(self.data, self.accession)
+
     # elif self.type in ["TA-1","TA-1/A","TA-W","TA-2","TA-2/A"]:
     #     tables = process_ta(self.data, self.accession)
     elif self.type in ["X-17A-5","X-17A-5/A"]:
@@ -70,6 +74,8 @@ def process_tabular_data(self):
     #     tables = process_ex102_abs(self.data, self.accession)
     elif self.type == "PROXY VOTING RECORD":
         tables = process_proxy_voting_record(self.data, self.accession)
+    elif self.type == 'submission_metadata':
+        tables = process_submission_metadata(self.content, self.accession)
     else:
         warn(f"Processing for {self.type} is not implemented yet.")
         return []
@@ -601,4 +607,28 @@ def process_reg_a(data, accession):
 #     raise NotImplementedError("Need to implement the rest of the MA processing")
 
 # def process_ncen(data, accession):
-#     raise NotImplementedError("Need to implement the N-CEN processing")
+#     raise NotImplementedError("Need to implement the N-CEN processing")
+
+# WIP
+# Note: going to pause this for now, as I don't have a great way of putting this in a csv.
+def process_submission_metadata(data, accession):
+    tables = []
+    document_data = safe_get(data, ['documents'])
+    if document_data:
+        tables.append(Table(_flatten_dict(document_data), 'document_submission_metadata', accession))
+
+    reporting_owner_data = safe_get(data, ['reporting-owner'])
+    if reporting_owner_data:
+        tables.append(Table(_flatten_dict(reporting_owner_data), 'reporting_owner_submission_metadata', accession))
+
+    issuer_data = safe_get(data, ['issuer'])
+    if issuer_data:
+        tables.append(Table(_flatten_dict(issuer_data), 'issuer_submission_metadata', accession))
+
+    # # construct metadata
+    # accession-number date-of-filing-date-change, depositor-cik effectiveness-date
+
+    # # other tables
+    # depositor, securitizer
+
+    return tables
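
process_submission_metadata leans on the library's safe_get and _flatten_dict helpers. Simplified stand-ins (assumed behavior for illustration, not the actual implementations) showing what the processor expects:

    def safe_get(data, keys):
        # Walk nested dicts, returning None instead of raising KeyError.
        for key in keys:
            if not isinstance(data, dict) or key not in data:
                return None
            data = data[key]
        return data

    def _flatten_dict(d, parent_key=''):
        # Collapse nested dicts into underscore-joined keys: {'a': {'b': 1}} -> {'a_b': 1}.
        items = {}
        for key, value in d.items():
            new_key = f"{parent_key}_{key}" if parent_key else key
            if isinstance(value, dict):
                items.update(_flatten_dict(value, new_key))
            else:
                items[new_key] = value
        return items

    metadata = {'documents': {'type': '10-K', 'sequence': '1'}}
    print(_flatten_dict(safe_get(metadata, ['documents'])))
    # {'type': '10-K', 'sequence': '1'}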
@@ -18,7 +18,7 @@ from .mappings.thirteenfhr import *
 from .mappings.twentyfivense import *
 from .mappings.twentyfourf2nt import *
 from .mappings.information_table import *
-
+from .mappings.submission_metadata import *
 # need to check if mappings correctly create new columns
 class Table():
     def __init__(self, data, type, accession):
@@ -228,6 +228,11 @@ class Table():
         elif self.type == 'signature_info_schedule_a':
             mapping_dict = signature_24f2nt_dict
 
+        # submission metadata
+        elif self.type == 'document_submission_metadata':
+            mapping_dict = document_submission_metadata_dict
+
+
         else:
             mapping_dict = {}
 
@@ -79,7 +79,16 @@ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
 
     # Convert ticker to CIK if provided
     if ticker is not None:
-        cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
+        if isinstance(ticker, str):
+            ticker = [ticker]
+
+        ciks_from_ticker = []
+        for t in ticker:
+            ciks = get_cik_from_dataset('listed_filer_metadata', 'ticker', t)
+            if ciks:
+                ciks_from_ticker.extend(ciks)
+
+        cik = ciks_from_ticker
 
     # Normalize CIK format
     if cik is not None:
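
The practical effect: ticker filters may now be a single string or a list, with each ticker resolved to its CIK(s) via the listed_filer_metadata dataset. A hedged usage sketch (Portfolio.download_submissions shown as a typical caller; its exact signature is assumed here):

    from datamule import Portfolio

    portfolio = Portfolio('filings')
    # A plain string still works; a list now resolves every ticker to its CIK(s).
    portfolio.download_submissions(ticker=['AAPL', 'MSFT'], submission_type='10-K')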
@@ -1,16 +1,16 @@
-from pathlib import Path
+
 from .sec.submissions.textsearch import query
-from .helper import _process_cik_and_metadata_filters, load_package_dataset
+from .helper import _process_cik_and_metadata_filters
+from pathlib import Path
 
 class Index:
-    def __init__(self, path=None):
-        self.path = Path(path) if path else None
+    def __init__(self):
+        pass
 
     def search_submissions(
         self,
         text_query,
-        start_date=None,
-        end_date=None,
+        filing_date=None,
         submission_type=None,
         cik=None,
         ticker=None,
@@ -47,16 +47,14 @@ class Index:
         # Execute the search query
         results = query(
             f'{text_query}',
-            filing_date=(start_date, end_date),
+            filing_date=filing_date,
             requests_per_second=requests_per_second,
             quiet=quiet,
             submission_type=submission_type,
             **kwargs
         )
 
-        # Save results to path if specified
-        if self.path:
-            self._save_results(results, text_query)
+
 
         return results
 
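Index no longer takes a path or saves results itself, and the start_date/end_date pair collapses into a single filing_date argument that is forwarded unchanged to the EFTS query (a (start, end) tuple, judging by the removed code). A usage sketch under those assumptions:

    from datamule import Index

    index = Index()
    results = index.search_submissions(
        'climate risk',
        filing_date=('2024-01-01', '2024-06-30'),  # forwarded as-is to EFTS
        submission_type='10-K',
        quiet=True,
    )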
@@ -9,22 +9,28 @@ import os
 from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
-from .sec.submissions.monitor import monitor
-from .sec.xbrl.xbrlmonitor import XBRLMonitor
+from .sec.submissions.monitor import Monitor
+#from .sec.xbrl.xbrlmonitor import XBRLMonitor
 
 
 class Portfolio:
     def __init__(self, path):
         self.path = Path(path)
+        self.api_key = None
         self.submissions = []
         self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1
+
+        self.monitor = Monitor()
 
         if self.path.exists():
             self._load_submissions()
             self.submissions_loaded = True
         else:
             self.path.mkdir(parents=True, exist_ok=True)
+
+    def set_api_key(self, api_key):
+        self.api_key = api_key
 
     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir()]
@@ -132,6 +138,7 @@ class Portfolio:
         seclibrary_download(
             output_dir=self.path,
             cik=cik,
+            api_key=self.api_key,
             submission_type=submission_type,
             filing_date=filing_date,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
@@ -149,20 +156,18 @@ class Portfolio:
         )
 
         self.submissions_loaded = False
-    def monitor_submissions(self, data_callback=None, poll_callback=None, submission_type=None, cik=None,
-                            polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=600000):
 
-        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
 
-        monitor(
+        self.monitor.monitor_submissions(
             data_callback=data_callback,
-            poll_callback=poll_callback,
-            cik=cik,
-            submission_type=submission_type,
+            interval_callback=interval_callback,
             polling_interval=polling_interval,
-            requests_per_second=requests_per_second,
             quiet=quiet,
-            start_date=start_date
+            start_date=start_date,
+            validation_interval=validation_interval
         )
 
 
@@ -179,8 +184,4 @@ class Portfolio:
             document_types = [document_types]
 
         for submission in self.submissions:
-            yield from submission.document_type(document_types)
-
-    def keep(self, document_type):
-        for submission in self.__iter__():
-            submission.keep(document_type)
+            yield from submission.document_type(document_types)
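
monitor_submissions now delegates to the new Monitor class and drops per-call cik/ticker/submission_type filtering; data_callback receives batches of dicts with accession, submission_type, ciks, and filing_date keys (see the new monitor.py below). A sketch:

    from datamule import Portfolio

    def on_new_submissions(batch):
        # Each item: {'accession': int, 'submission_type': str, 'ciks': [...], 'filing_date': 'YYYY-MM-DD'}
        for item in batch:
            print(item['accession'], item['submission_type'], item['ciks'])

    portfolio = Portfolio('filings')
    portfolio.monitor_submissions(
        data_callback=on_new_submissions,
        polling_interval=1000,       # poll the RSS feed every second
        validation_interval=600000,  # reconcile against EFTS every ten minutes
    )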
@@ -0,0 +1,183 @@
+import time
+from collections import deque
+from datetime import datetime
+import xml.etree.ElementTree as ET
+import re
+import asyncio
+from ..utils import headers, PreciseRateLimiter
+from .eftsquery import EFTSQuery
+import aiohttp
+
+
+async def poll_rss(limiter):
+    base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
+
+    # Create a session specifically for this RSS polling operation
+    async with aiohttp.ClientSession(headers=headers) as session:
+        # Use the rate limiter before making the request
+        async with limiter:
+            # Make the HTTP request with the session
+            async with session.get(base_url) as response:
+                content = await response.read()
+
+    # Process the content
+    content_str = content.decode('utf-8')
+    root = ET.fromstring(content_str)
+    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
+    entries = root.findall('atom:entry', namespace)
+    grouped = {}
+
+    for entry in entries:
+        url = entry.find('atom:link', namespace).get('href')
+        accession = re.search(r'/(\d{10})-(\d{2})-(\d{6})', url)
+        accession = accession.group(1) + accession.group(2) + accession.group(3)
+        cik = re.search(r'/data/(\d+)/', url).group(1)
+
+        if accession not in grouped:
+            grouped[accession] = {'submission_type': '', 'ciks': set(), 'filing_date': ''}
+
+        grouped[accession]['ciks'].add(cik)
+        grouped[accession]['submission_type'] = entry.find('atom:category', namespace).get('term')
+        summary_text = entry.find('atom:summary', namespace).text
+        filing_date_match = re.search(r'Filed:</b>\s*(\d{4}-\d{2}-\d{2})', summary_text)
+        if filing_date_match:
+            grouped[accession]['filing_date'] = filing_date_match.group(1)
+
+    results = [{'accession': int(k.replace('-', '')), 'submission_type': v['submission_type'], 'ciks': list(v['ciks']), 'filing_date': v['filing_date']} for k, v in grouped.items()]
+    return results
+
+def clean_efts_hits(hits):
+    # clean hits
+    hits = [{'accession': int(hit['_source']['adsh'].replace('-', '')), 'filing_date': hit['_source']['file_date'], 'ciks': hit['_source']['ciks']} for hit in hits]
+    return hits
+
+class Monitor():
+    def __init__(self):
+        self.accessions = deque(maxlen=50000)
+        self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
+        self.efts_query = EFTSQuery(quiet=True)
+        self.efts_query.limiter = self.ratelimiters['sec.gov']
+
+    def set_domain_rate_limit(self, domain, rate):
+        self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
+        if domain == 'sec.gov':
+            self.efts_query.limiter = self.ratelimiters[domain]
+
+    async def _async_run_efts_query(self, **kwargs):
+        """Async helper method to run EFTS query without creating a new event loop"""
+        # Make sure to set quiet parameter if provided in kwargs
+        self.efts_query.quiet = kwargs.get('quiet', True)
+        return await self.efts_query.query(
+            cik=kwargs.get('cik'),
+            submission_type=kwargs.get('submission_type'),
+            filing_date=kwargs.get('filing_date'),
+            location=kwargs.get('location'),
+            callback=kwargs.get('callback'),
+            name=kwargs.get('name')
+        )
+
+    async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
+        """
+        Async implementation of monitor_submissions.
+        """
+
+        # Backfill if start_date is provided
+        if start_date is not None:
+            today_date = datetime.now().date().strftime('%Y-%m-%d')
+            if not quiet:
+                print(f"Backfilling from {start_date} to {today_date}")
+
+            hits = clean_efts_hits(await self._async_run_efts_query(
+                filing_date=(start_date, today_date),
+                quiet=quiet
+            ))
+
+            new_hits = self._filter_new_accessions(hits)
+            if not quiet:
+                print(f"New submissions found: {len(new_hits)}")
+            if new_hits and data_callback:
+                data_callback(new_hits)
+
+        last_polling_time = time.time()
+        last_validation_time = last_polling_time
+        current_time = last_polling_time
+
+        while True:
+            # RSS polling
+            if not quiet:
+                print(f"Polling RSS feed")
+            results = await poll_rss(self.ratelimiters['sec.gov'])
+            new_results = self._filter_new_accessions(results)
+            if new_results:
+                if not quiet:
+                    print(f"Found {len(new_results)} new submissions via RSS")
+                if data_callback:
+                    data_callback(new_results)
+
+            # EFTS validation
+            if validation_interval and (current_time - last_validation_time) >= validation_interval/1000:
+                # Get submissions from the last 24 hours for validation
+                today_date = datetime.now().strftime('%Y-%m-%d')
+                if not quiet:
+                    print(f"Validating submissions from {today_date}")
+
+                hits = clean_efts_hits(await self._async_run_efts_query(
+                    filing_date=(today_date, today_date),
+                    quiet=quiet
+                ))
+
+                new_hits = self._filter_new_accessions(hits)
+                if new_hits:
+                    if not quiet:
+                        print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                    if data_callback:
+                        data_callback(new_hits)
+                last_polling_time = time.time()
+                last_validation_time = current_time
+
+            # Interval callback
+            if interval_callback:
+                interval_callback()
+
+            next_poll_time = last_polling_time + (polling_interval / 1000)
+            current_time = time.time()
+            time_to_sleep = max(0, next_poll_time - current_time)
+            await asyncio.sleep(time_to_sleep)
+            last_polling_time = next_poll_time
+
+
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=60000):
+        """
+        Monitor SEC submissions using the EDGAR system.
+        :param data_callback: function to call with the data
+        :param interval_callback: function that executes between polls
+        :param polling_interval: interval between polls in milliseconds
+        :param quiet: if True, suppresses output
+        :param start_date: backfill start date in YYYY-MM-DD format
+        :param validation_interval: interval between validation in milliseconds
+
+        This function combines the speed of the RSS feed (fast, but misses some submissions) with the accuracy of the EFTS system.
+        """
+        # This is now a synchronous wrapper around the async implementation
+        return asyncio.run(self._async_monitor_submissions(
+            data_callback=data_callback,
+            interval_callback=interval_callback,
+            polling_interval=polling_interval,
+            quiet=quiet,
+            start_date=start_date,
+            validation_interval=validation_interval
+        ))
+
+    def _filter_new_accessions(self, items):
+        """Filter items to only include those with new accession numbers."""
+        new_items = []
+        for item in items:
+            accession = item['accession']
+            if accession not in self.accessions:
+                self.accessions.append(accession)
+                new_items.append(item)
+        return new_items
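
Monitor can also be used on its own, for example to backfill from a start date before live polling begins or to tune the sec.gov rate limit. A sketch grounded in the code above:

    from datamule.sec.submissions.monitor import Monitor

    monitor = Monitor()
    monitor.set_domain_rate_limit('sec.gov', 5)  # requests per second

    monitor.monitor_submissions(
        data_callback=lambda batch: print(f"{len(batch)} new submissions"),
        start_date='2025-01-01',    # one-time EFTS backfill before RSS polling starts
        validation_interval=60000,  # cross-check RSS against EFTS once a minute
        quiet=False,
    )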
@@ -1,8 +1,4 @@
 import asyncio
-import aiohttp
-from datetime import datetime
-from urllib.parse import urlencode
-from tqdm import tqdm
 from .eftsquery import EFTSQuery
 
 class TextSearchEFTSQuery(EFTSQuery):
@@ -2,7 +2,7 @@ import asyncio
 import aiohttp
 import json
 from tqdm import tqdm
-from ..utils import PreciseRateLimiter, RateMonitor, RetryException, headers
+from ..utils import PreciseRateLimiter, RateMonitor, headers
 
 async def fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar):
     # Format CIK with leading zeros to 10 digits