datamule 1.2.4__py3-none-any.whl → 1.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/__init__.py CHANGED
@@ -8,6 +8,7 @@ from .index import Index
8
8
  from .package_updater import PackageUpdater
9
9
 
10
10
 
11
+
11
12
  # Keep the notebook environment setup
12
13
  def _is_notebook_env():
13
14
  """Check if the code is running in a Jupyter or Colab environment."""
@@ -118,10 +118,11 @@ class Document:
118
118
  # will deprecate this when we add html2dict
119
119
  elif self.extension in ['.htm', '.html','.txt']:
120
120
 
121
- if self.type == '10-K':
122
- mapping_dict = dict_10k
123
- elif self.type == '10-Q':
121
+
122
+ if self.type == '10-Q':
124
123
  mapping_dict = dict_10q
124
+ elif self.type == '10-K':
125
+ mapping_dict = dict_10k
125
126
  elif self.type == '8-K':
126
127
  mapping_dict = dict_8k
127
128
  elif self.type == 'SC 13D':
@@ -141,27 +142,39 @@ class Document:
141
142
  json.dump(self.data, f, indent=2)
142
143
 
143
144
  def to_tabular(self):
144
- if self.extension != '.xml':
145
+ if self.type == 'submission_metadata':
146
+ return process_tabular_data(self)
147
+ elif self.extension != '.xml':
145
148
  return []
146
- self.parse()
147
- return process_tabular_data(self)
149
+ else:
150
+ self.parse()
151
+ return process_tabular_data(self)
148
152
 
149
153
 
150
- def write_csv(self, output_folder, accession_number=None):
154
+ def write_csv(self, output_folder):
155
+ output_folder = Path(output_folder)
156
+ output_folder.mkdir(exist_ok=True)
151
157
 
152
- tables = self.to_tabular(accession_number)
158
+ tables = self.to_tabular()
153
159
 
154
160
  if not tables:
155
161
  return
156
162
 
157
163
  for table in tables:
158
164
  fieldnames = table.columns
159
- output_filename = Path(output_folder) / f"{table.type}.csv"
165
+ output_filename = output_folder / f"{table.type}.csv"
166
+
167
+ # Check if the file already exists
168
+ if output_filename.exists():
160
169
 
161
- with open(output_filename, 'w', newline='') as csvfile:
162
- writer = csv.DictWriter(csvfile,fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
163
- writer.writeheader()
164
- writer.writerows(table.data)
170
+ with open(output_filename, 'a', newline='') as csvfile:
171
+ writer = csv.DictWriter(csvfile,fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
172
+ writer.writerows(table.data)
173
+ else:
174
+ with open(output_filename, 'w', newline='') as csvfile:
175
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
176
+ writer.writeheader()
177
+ writer.writerows(table.data)
165
178
 
166
179
 
167
180
  def _document_to_section_text(self, document_data, parent_key=''):
@@ -1,3 +1,4 @@
1
+ # Ready for mass testing
1
2
 
2
3
  # Information Table (13F-HR Securities) mapping
3
4
  information_table_dict = {
@@ -1,4 +1,4 @@
1
- # Mapping dictionaries for SEC filing table types based on actual field occurrences
1
+ # Ready for mass testing
2
2
 
3
3
  # Non-derivative transaction ownership mapping
4
4
  non_derivative_transaction_ownership_dict = {
@@ -1 +1,17 @@
1
- proxy_voting_record_dict = {}
1
+ proxy_voting_record_dict = {
2
+ 'meetingDate': 'meetingDate',
3
+ 'accession': 'accessionNumber',
4
+ 'vote_voteRecord_managementRecommendation': 'managementRecommendation',
5
+ 'sharesVoted': 'sharesVoted', # Top-level sharesVoted
6
+ 'vote_voteRecord_howVoted': 'howVoted',
7
+ 'sharesOnLoan': 'sharesOnLoan',
8
+ 'cusip': 'cusip',
9
+ 'issuerName': 'issuerName',
10
+ 'voteCategories_voteCategory_categoryType': 'categoryType',
11
+ 'voteDescription': 'voteDescription',
12
+ 'voteManager_otherManagers_otherManager': 'otherManager',
13
+ 'vote_voteRecord_sharesVoted': 'recordSharesVoted', # To distinguish from top-level sharesVoted
14
+ 'isin': 'isin',
15
+ 'voteSource': 'voteSource',
16
+ 'voteSeries': 'voteSeries'
17
+ }
@@ -0,0 +1,9 @@
1
+ # Note: submission_metadata is my designation, not SEC for the header of the Submission tag
2
+
3
+ document_submission_metadata_dict = {
4
+ 'accession':'accession',
5
+ 'type':'type',
6
+ 'sequence' : 'sequence',
7
+ 'filename' : 'filename',
8
+ 'description':'description'
9
+ }
@@ -1,5 +1,72 @@
1
+ # Ready for mass testing
1
2
 
2
3
  # 13F-HR (Institutional Investment Manager Holdings) mapping
3
- thirteenfhr_dict = {
4
-
5
- }
4
+ thirteenfhr_dict = {
5
+ # Cover Page Mapping
6
+ 'formData_coverPage_reportCalendarOrQuarter': 'reportCalendarOrQuarter',
7
+ 'formData_coverPage_filingManager_name': 'filingManagerName',
8
+ 'formData_coverPage_filingManager_address_street1': 'filingManagerStreet1',
9
+ 'formData_coverPage_filingManager_address_street2': 'filingManagerStreet2',
10
+ 'formData_coverPage_filingManager_address_city': 'filingManagerCity',
11
+ 'formData_coverPage_filingManager_address_stateOrCountry': 'filingManagerStateOrCountry',
12
+ 'formData_coverPage_filingManager_address_zipCode': 'filingManagerZipCode',
13
+ 'formData_coverPage_crdNumber': 'crdNumber',
14
+ 'formData_coverPage_secFileNumber': 'secFileNumber',
15
+ 'formData_coverPage_form13FFileNumber': 'form13FFileNumber',
16
+ 'formData_coverPage_reportType': 'reportType',
17
+ 'formData_coverPage_isAmendment': 'isAmendment',
18
+ 'formData_coverPage_amendmentNo': 'amendmentNo',
19
+ 'formData_coverPage_amendmentInfo_amendmentType': 'amendmentType',
20
+ 'formData_coverPage_amendmentInfo_confDeniedExpired': 'confDeniedExpired',
21
+ 'formData_coverPage_additionalInformation': 'additionalInformation',
22
+ 'formData_coverPage_provideInfoForInstruction5': 'provideInfoForInstruction5',
23
+
24
+ # Other Managers Info Mapping
25
+ 'formData_coverPage_otherManagersInfo_otherManager': 'otherManager',
26
+ 'formData_coverPage_otherManagersInfo_otherManager_cik': 'otherManagerCik',
27
+ 'formData_coverPage_otherManagersInfo_otherManager_name': 'otherManagerName',
28
+ 'formData_coverPage_otherManagersInfo_otherManager_crdNumber': 'otherManagerCrdNumber',
29
+ 'formData_coverPage_otherManagersInfo_otherManager_secFileNumber': 'otherManagerSecFileNumber',
30
+ 'formData_coverPage_otherManagersInfo_otherManager_form13FFileNumber': 'otherManagerForm13FFileNumber',
31
+
32
+ # Summary Page Mapping
33
+ 'formData_summaryPage_isConfidentialOmitted': 'isConfidentialOmitted',
34
+ 'formData_summaryPage_otherIncludedManagersCount': 'otherIncludedManagersCount',
35
+ 'formData_summaryPage_tableEntryTotal': 'tableEntryTotal',
36
+ 'formData_summaryPage_tableValueTotal': 'tableValueTotal',
37
+
38
+ # Other Managers 2 Info Mapping
39
+ 'formData_summaryPage_otherManagers2Info_otherManager2': 'otherManager2',
40
+ 'formData_summaryPage_otherManagers2Info_otherManager2_sequenceNumber': 'otherManager2SequenceNumber',
41
+ 'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_cik': 'otherManager2Cik',
42
+ 'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_name': 'otherManager2Name',
43
+ 'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_crdNumber': 'otherManager2CrdNumber',
44
+ 'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_secFileNumber': 'otherManager2SecFileNumber',
45
+ 'formData_summaryPage_otherManagers2Info_otherManager2_otherManager_form13FFileNumber': 'otherManager2Form13FFileNumber',
46
+
47
+ # Signature Block Mapping
48
+ 'formData_signatureBlock_name': 'signatureName',
49
+ 'formData_signatureBlock_title': 'signatureTitle',
50
+ 'formData_signatureBlock_phone': 'signaturePhone',
51
+ 'formData_signatureBlock_signature': 'signature',
52
+ 'formData_signatureBlock_city': 'signatureCity',
53
+ 'formData_signatureBlock_stateOrCountry': 'signatureStateOrCountry',
54
+ 'formData_signatureBlock_signatureDate': 'signatureDate',
55
+
56
+ # Header Data Mapping
57
+ 'headerData_filerInfo_periodOfReport': 'periodOfReport',
58
+ 'headerData_filerInfo_filer_fileNumber': 'filerFileNumber',
59
+ 'headerData_filerInfo_filer_credentials_cik': 'filerCik',
60
+ 'headerData_filerInfo_filer_credentials_ccc': 'filerCcc',
61
+ 'headerData_filerInfo_flags_confirmingCopyFlag': 'confirmingCopyFlag',
62
+ 'headerData_filerInfo_flags_returnCopyFlag': 'returnCopyFlag',
63
+ 'headerData_filerInfo_flags_overrideInternetFlag': 'overrideInternetFlag',
64
+ 'headerData_filerInfo_denovoRequest': 'denovoRequest',
65
+ 'headerData_filerInfo_liveTestFlag': 'liveTestFlag',
66
+ 'headerData_submissionType': 'submissionType',
67
+
68
+ # Schema and Metadata Mapping
69
+ 'schemaLocation': 'schemaLocation',
70
+ 'schemaVersion': 'schemaVersion',
71
+ 'accession': 'accessionNumber'
72
+ }
@@ -1,3 +1,4 @@
1
+ # Ready for mass testing
1
2
  # 25-NSE mapping
2
3
  twentyfive_nse_dict = {
3
4
  'descriptionClassSecurity': 'securityDescription',
@@ -17,6 +17,12 @@ def process_tabular_data(self):
17
17
  tables = process_13fhr(self.data, self.accession)
18
18
  elif self.type in ["INFORMATION TABLE"]:
19
19
  tables = process_information_table(self.data, self.accession)
20
+ elif self.type in ["25-NSE", "25-NSE/A"]:
21
+ tables = process_25nse(self.data, self.accession)
22
+ # complete mark:
23
+ elif self.type in ["N-PX","N-PX/A"]:
24
+ tables = process_npx(self.data, self.accession)
25
+
20
26
  elif self.type in ["SBSEF","SBSEF/A","SBSEF-V","SBSEF-W"]:
21
27
  tables = process_sbsef(self.data, self.accession)
22
28
  elif self.type in ["SDR","SDR/A","SDR-W","SDR-A"]:
@@ -33,8 +39,7 @@ def process_tabular_data(self):
33
39
  tables = process_144(self.data, self.accession)
34
40
  elif self.type in ["24F-2NT", "24F-2NT/A"]:
35
41
  tables = process_24f2nt(self.data, self.accession)
36
- elif self.type in ["25-NSE", "25-NSE/A"]:
37
- tables = process_25nse(self.data, self.accession)
42
+
38
43
  elif self.type in ["ATS-N", "ATS-N/A"]:
39
44
  tables = process_ats(self.data, self.accession)
40
45
  # elif self.type in ["C","C-W","C-U","C-U-W","C/A","C/A-W",
@@ -53,8 +58,7 @@ def process_tabular_data(self):
53
58
  # tables = process_nmfp(self.data, self.accession)
54
59
  # elif self.type in ["NPORT-P","NPORT-P/A"]:
55
60
  # tables = process_nportp(self.data, self.accession)
56
- elif self.type in ["N-PX","N-PX/A"]:
57
- tables = process_npx(self.data, self.accession)
61
+
58
62
  # elif self.type in ["TA-1","TA-1/A","TA-W","TA-2","TA-2/A"]:
59
63
  # tables = process_ta(self.data, self.accession)
60
64
  elif self.type in ["X-17A-5","X-17A-5/A"]:
@@ -70,6 +74,8 @@ def process_tabular_data(self):
70
74
  # tables = process_ex102_abs(self.data, self.accession)
71
75
  elif self.type == "PROXY VOTING RECORD":
72
76
  tables = process_proxy_voting_record(self.data, self.accession)
77
+ elif self.type == 'submission_metadata':
78
+ tables = process_submission_metadata(self.content, self.accession)
73
79
  else:
74
80
  warn(f"Processing for {self.type} is not implemented yet.")
75
81
  return []
@@ -601,4 +607,28 @@ def process_reg_a(data, accession):
601
607
  # raise NotImplementedError("Need to implement the rest of the MA processing")
602
608
 
603
609
  # def process_ncen(data, accession):
604
- # raise NotImplementedError("Need to implement the N-CEN processing")
610
+ # raise NotImplementedError("Need to implement the N-CEN processing")
611
+
612
+ # WIP
613
+ # Note: going to pause this for now, as I don't have a great way of putting this in a csv.
614
+ def process_submission_metadata(data,accession):
615
+ tables = []
616
+ document_data = safe_get(data, ['documents'])
617
+ if document_data:
618
+ tables.append(Table(_flatten_dict(document_data), 'document_submission_metadata', accession))
619
+
620
+ reporting_owner_data = safe_get(data,['reporting-owner'])
621
+ if reporting_owner_data:
622
+ tables.append(Table(_flatten_dict(reporting_owner_data), 'reporting_owner_submission_metadata', accession))
623
+
624
+ issuer_data = safe_get(data,['issuer'])
625
+ if issuer_data:
626
+ tables.append(Table(_flatten_dict(issuer_data), 'issuer_submission_metadata', accession))
627
+
628
+ # # construct metadata
629
+ # accession-number date-of-filing-date-change, depositor-cik effectiveness-date
630
+
631
+ # # other tables
632
+ # depositor, securitizer
633
+
634
+ return tables
@@ -18,7 +18,7 @@ from .mappings.thirteenfhr import *
18
18
  from .mappings.twentyfivense import *
19
19
  from .mappings.twentyfourf2nt import *
20
20
  from .mappings.information_table import *
21
-
21
+ from .mappings.submission_metadata import *
22
22
  # need to check if mappings correctly create new columns
23
23
  class Table():
24
24
  def __init__(self, data, type,accession):
@@ -228,6 +228,11 @@ class Table():
228
228
  elif self.type == 'signature_info_schedule_a':
229
229
  mapping_dict = signature_24f2nt_dict
230
230
 
231
+ # submission metadata
232
+ elif self.type == 'document_submission_metadata':
233
+ mapping_dict = document_submission_metadata_dict
234
+
235
+
231
236
  else:
232
237
  mapping_dict = {}
233
238
 
datamule/helper.py CHANGED
@@ -79,7 +79,16 @@ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
79
79
 
80
80
  # Convert ticker to CIK if provided
81
81
  if ticker is not None:
82
- cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
82
+ if isinstance(ticker, str):
83
+ ticker = [ticker]
84
+
85
+ ciks_from_ticker = []
86
+ for t in ticker:
87
+ ciks = get_cik_from_dataset('listed_filer_metadata', 'ticker', t)
88
+ if ciks:
89
+ ciks_from_ticker.extend(ciks)
90
+
91
+ cik = ciks
83
92
 
84
93
  # Normalize CIK format
85
94
  if cik is not None:
datamule/index.py CHANGED
@@ -1,16 +1,16 @@
1
- from pathlib import Path
1
+
2
2
  from .sec.submissions.textsearch import query
3
- from .helper import _process_cik_and_metadata_filters, load_package_dataset
3
+ from .helper import _process_cik_and_metadata_filters
4
+ from pathlib import Path
4
5
 
5
6
  class Index:
6
- def __init__(self, path=None):
7
- self.path = Path(path) if path else None
7
+ def __init__(self):
8
+ pass
8
9
 
9
10
  def search_submissions(
10
11
  self,
11
12
  text_query,
12
- start_date=None,
13
- end_date=None,
13
+ filing_date=None,
14
14
  submission_type=None,
15
15
  cik=None,
16
16
  ticker=None,
@@ -47,16 +47,14 @@ class Index:
47
47
  # Execute the search query
48
48
  results = query(
49
49
  f'{text_query}',
50
- filing_date=(start_date, end_date),
50
+ filing_date=filing_date,
51
51
  requests_per_second=requests_per_second,
52
52
  quiet=quiet,
53
53
  submission_type=submission_type,
54
54
  **kwargs
55
55
  )
56
56
 
57
- # Save results to path if specified
58
- if self.path:
59
- self._save_results(results, text_query)
57
+
60
58
 
61
59
  return results
62
60
 
datamule/portfolio.py CHANGED
@@ -9,22 +9,28 @@ import os
9
9
  from .helper import _process_cik_and_metadata_filters
10
10
  from .seclibrary.downloader import download as seclibrary_download
11
11
  from .sec.xbrl.filter_xbrl import filter_xbrl
12
- from .sec.submissions.monitor import monitor
13
- from .sec.xbrl.xbrlmonitor import XBRLMonitor
12
+ from .sec.submissions.monitor import Monitor
13
+ #from .sec.xbrl.xbrlmonitor import XBRLMonitor
14
14
 
15
15
 
16
16
  class Portfolio:
17
17
  def __init__(self, path):
18
18
  self.path = Path(path)
19
+ self.api_key = None
19
20
  self.submissions = []
20
21
  self.submissions_loaded = False
21
22
  self.MAX_WORKERS = os.cpu_count() - 1
23
+
24
+ self.monitor = Monitor()
22
25
 
23
26
  if self.path.exists():
24
27
  self._load_submissions()
25
28
  self.submissions_loaded = True
26
29
  else:
27
30
  self.path.mkdir(parents=True, exist_ok=True)
31
+
32
+ def set_api_key(self, api_key):
33
+ self.api_key = api_key
28
34
 
29
35
  def _load_submissions(self):
30
36
  folders = [f for f in self.path.iterdir() if f.is_dir()]
@@ -132,6 +138,7 @@ class Portfolio:
132
138
  seclibrary_download(
133
139
  output_dir=self.path,
134
140
  cik=cik,
141
+ api_key=self.api_key,
135
142
  submission_type=submission_type,
136
143
  filing_date=filing_date,
137
144
  accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
@@ -149,20 +156,18 @@ class Portfolio:
149
156
  )
150
157
 
151
158
  self.submissions_loaded = False
152
- def monitor_submissions(self,data_callback=None, poll_callback=None, submission_type=None, cik=None,
153
- polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
159
+ def monitor_submissions(self, data_callback=None, interval_callback=None,
160
+ polling_interval=1000, quiet=True, start_date=None,
161
+ validation_interval=600000):
154
162
 
155
- cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
156
163
 
157
- monitor(
164
+ self.monitor.monitor_submissions(
158
165
  data_callback=data_callback,
159
- poll_callback=poll_callback,
160
- cik=cik,
161
- submission_type=submission_type,
166
+ interval_callback=interval_callback,
162
167
  polling_interval=polling_interval,
163
- requests_per_second=requests_per_second,
164
168
  quiet=quiet,
165
- start_date=start_date
169
+ start_date=start_date,
170
+ validation_interval=validation_interval
166
171
  )
167
172
 
168
173
 
@@ -179,8 +184,4 @@ class Portfolio:
179
184
  document_types = [document_types]
180
185
 
181
186
  for submission in self.submissions:
182
- yield from submission.document_type(document_types)
183
-
184
- def keep(self,document_type):
185
- for submission in self.__iter__():
186
- submission.keep(document_type)
187
+ yield from submission.document_type(document_types)