datamule 1.5.8__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,7 @@ from .processing import process_tabular_data
12
12
  from pathlib import Path
13
13
  import webbrowser
14
14
  from secsgml.utils import bytes_to_str
15
+ from secxbrl import parse_inline_xbrl
15
16
 
16
17
  class Document:
17
18
  def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -33,6 +34,7 @@ class Document:
33
34
  self.extension = extension
34
35
  # this will be filled by parsed
35
36
  self.data = None
37
+ self.xbrl = None
36
38
 
37
39
  #_load_text_content
38
40
  def _preprocess_txt_content(self):
@@ -101,12 +103,23 @@ class Document:
101
103
  if self.extension in ['.htm', '.html', '.txt','.xml']:
102
104
  return bool(re.search(pattern, self.content))
103
105
  return False
106
+
107
+ def parse_xbrl(self,type='inline'):
108
+ if self.xbrl:
109
+ return
110
+ if type =='inline':
111
+ if self.extension not in ['.htm','.html']:
112
+ return
113
+
114
+ self.xbrl = parse_inline_xbrl(self.content)
115
+ else:
116
+ raise ValueError("Only inline has been implemented so far.")
104
117
 
105
118
  # Note: this method will be heavily modified in the future
106
119
  def parse(self):
107
120
  # check if we have already parsed the content
108
121
  if self.data:
109
- return self.data
122
+ return
110
123
 
111
124
  mapping_dict = None
112
125
 
File without changes
@@ -0,0 +1,13 @@
1
+ import ownership
2
+
3
+
4
+ # key is document type
5
+ # note: this assumes XML format.
6
+ table_mappings = {
7
+ '3' : ownership.mappings,
8
+ '3/A' : ownership.mappings,
9
+ '4' : ownership.mappings,
10
+ '4/A' : ownership.mappings,
11
+ '5' : ownership.mappings,
12
+ '5/A' : ownership.mappings,
13
+ }
@@ -0,0 +1,174 @@
1
+
2
+
3
+
4
+ # Non-derivative transaction ownership mapping
5
+ ownership_non_derivative_transactions_dict = {
6
+ 'securityTitle_value': 'securityTitle',
7
+ 'securityTitle_footnote': 'securityTitleFootnote',
8
+ 'transactionDate_value': 'transactionDate',
9
+ 'transactionDate_footnote': 'transactionDateFootnote',
10
+ 'deemedExecutionDate_value': 'deemedExecutionDate',
11
+ 'deemedExecutionDate_footnote': 'deemedExecutionDateFootnote',
12
+ 'transactionCoding_transactionFormType': 'transactionFormType',
13
+ 'transactionCoding_transactionCode': 'transactionCode',
14
+ 'transactionCoding_equitySwapInvolved': 'equitySwapInvolved',
15
+ 'transactionCoding_footnote': 'transactionCodingFootnote',
16
+ 'transactionAmounts_transactionShares_value': 'transactionShares',
17
+ 'transactionAmounts_transactionShares_footnote': 'transactionSharesFootnote',
18
+ 'transactionAmounts_transactionPricePerShare_value': 'transactionPricePerShare',
19
+ 'transactionAmounts_transactionPricePerShare_footnote': 'transactionPricePerShareFootnote',
20
+ 'transactionAmounts_transactionAcquiredDisposedCode_value': 'transactionAcquiredDisposedCode',
21
+ 'transactionAmounts_transactionAcquiredDisposedCode_footnote': 'transactionAcquiredDisposedCodeFootnote',
22
+ 'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
23
+ 'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
24
+ 'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
25
+ 'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
26
+ 'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
27
+ 'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
28
+ 'transactionTimeliness_value': 'transactionTimeliness',
29
+ 'transactionTimeliness_footnote': 'transactionTimelinessFootnote',
30
+ 'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
31
+ 'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote'
32
+ }
33
+
34
+ # Derivative transaction ownership mapping
35
+ derivative_transaction_ownership_dict = {
36
+ 'securityTitle_value': 'securityTitle',
37
+ 'securityTitle_footnote': 'securityTitleFootnote',
38
+ 'conversionOrExercisePrice_value': 'conversionOrExercisePrice',
39
+ 'conversionOrExercisePrice_footnote': 'conversionOrExercisePriceFootnote',
40
+ 'transactionDate_value': 'transactionDate',
41
+ 'transactionDate_footnote': 'transactionDateFootnote',
42
+ 'deemedExecutionDate_value': 'deemedExecutionDate',
43
+ 'deemedExecutionDate_footnote': 'deemedExecutionDateFootnote',
44
+ 'transactionCoding_transactionFormType': 'transactionFormType',
45
+ 'transactionCoding_transactionCode': 'transactionCode',
46
+ 'transactionCoding_equitySwapInvolved': 'equitySwapInvolved',
47
+ 'transactionCoding_footnote': 'transactionCodingFootnote',
48
+ 'transactionAmounts_transactionShares_value': 'transactionShares',
49
+ 'transactionAmounts_transactionShares_footnote': 'transactionSharesFootnote',
50
+ 'transactionAmounts_transactionPricePerShare_value': 'transactionPricePerShare',
51
+ 'transactionAmounts_transactionPricePerShare_footnote': 'transactionPricePerShareFootnote',
52
+ 'transactionAmounts_transactionAcquiredDisposedCode_value': 'transactionAcquiredDisposedCode',
53
+ 'transactionAmounts_transactionTotalValue_value': 'transactionTotalValue',
54
+ 'transactionAmounts_transactionTotalValue_footnote': 'transactionTotalValueFootnote',
55
+ 'exerciseDate_value': 'exerciseDate',
56
+ 'exerciseDate_footnote': 'exerciseDateFootnote',
57
+ 'expirationDate_value': 'expirationDate',
58
+ 'expirationDate_footnote': 'expirationDateFootnote',
59
+ 'underlyingSecurity_underlyingSecurityTitle_value': 'underlyingSecurityTitle',
60
+ 'underlyingSecurity_underlyingSecurityTitle_footnote': 'underlyingSecurityTitleFootnote',
61
+ 'underlyingSecurity_underlyingSecurityShares_value': 'underlyingSecurityShares',
62
+ 'underlyingSecurity_underlyingSecurityShares_footnote': 'underlyingSecuritySharesFootnote',
63
+ 'underlyingSecurity_underlyingSecurityValue_value': 'underlyingSecurityValue',
64
+ 'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
65
+ 'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
66
+ 'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
67
+ 'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
68
+ 'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
69
+ 'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
70
+ 'transactionTimeliness_value': 'transactionTimeliness',
71
+ 'transactionTimeliness_footnote': 'transactionTimelinessFootnote',
72
+ 'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
73
+ 'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote',
74
+ 'transactionAmounts_transactionAcquiredDisposedCode_footnote': 'transactionAcquiredDisposedCodeFootnote',
75
+ 'underlyingSecurity_underlyingSecurityValue_footnote': 'underlyingSecurityValueFootnote'
76
+ }
77
+
78
+ # Non-derivative holding ownership mapping
79
+ non_derivative_holding_ownership_dict = {
80
+ 'securityTitle_value': 'securityTitle',
81
+ 'securityTitle_footnote': 'securityTitleFootnote',
82
+ 'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
83
+ 'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
84
+ 'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
85
+ 'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
86
+ 'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
87
+ 'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
88
+ 'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
89
+ 'transactionCoding_footnote': 'transactionCodingFootnote',
90
+ 'transactionCoding_transactionFormType': 'transactionFormType',
91
+ 'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote'
92
+ }
93
+
94
+ # Derivative holding ownership mapping
95
+ derivative_holding_ownership_dict = {
96
+ 'securityTitle_value': 'securityTitle',
97
+ 'securityTitle_footnote': 'securityTitleFootnote',
98
+ 'conversionOrExercisePrice_value': 'conversionOrExercisePrice',
99
+ 'conversionOrExercisePrice_footnote': 'conversionOrExercisePriceFootnote',
100
+ 'exerciseDate_value': 'exerciseDate',
101
+ 'exerciseDate_footnote': 'exerciseDateFootnote',
102
+ 'expirationDate_value': 'expirationDate',
103
+ 'expirationDate_footnote': 'expirationDateFootnote',
104
+ 'underlyingSecurity_underlyingSecurityTitle_value': 'underlyingSecurityTitle',
105
+ 'underlyingSecurity_underlyingSecurityTitle_footnote': 'underlyingSecurityTitleFootnote',
106
+ 'underlyingSecurity_underlyingSecurityShares_value': 'underlyingSecurityShares',
107
+ 'underlyingSecurity_underlyingSecurityShares_footnote': 'underlyingSecuritySharesFootnote',
108
+ 'underlyingSecurity_underlyingSecurityValue_value': 'underlyingSecurityValue',
109
+ 'underlyingSecurity_underlyingSecurityValue_footnote': 'underlyingSecurityValueFootnote',
110
+ 'ownershipNature_directOrIndirectOwnership_value': 'directOrIndirectOwnership',
111
+ 'ownershipNature_directOrIndirectOwnership_footnote': 'directOrIndirectOwnershipFootnote',
112
+ 'ownershipNature_natureOfOwnership_value': 'natureOfOwnership',
113
+ 'ownershipNature_natureOfOwnership_footnote': 'natureOfOwnershipFootnote',
114
+ 'postTransactionAmounts_sharesOwnedFollowingTransaction_value': 'sharesOwnedFollowingTransaction',
115
+ 'postTransactionAmounts_sharesOwnedFollowingTransaction_footnote': 'sharesOwnedFollowingTransactionFootnote',
116
+ 'postTransactionAmounts_valueOwnedFollowingTransaction_value': 'valueOwnedFollowingTransaction',
117
+ 'postTransactionAmounts_valueOwnedFollowingTransaction_footnote': 'valueOwnedFollowingTransactionFootnote',
118
+ 'transactionCoding_transactionFormType': 'transactionFormType',
119
+ 'transactionCoding_footnote': 'transactionCodingFootnote'
120
+ }
121
+
122
+ # Reporting owner ownership mapping
123
+ reporting_owner_ownership_dict = {
124
+ 'reportingOwnerAddress_rptOwnerCity': 'rptOwnerCity',
125
+ 'reportingOwnerAddress_rptOwnerState': 'rptOwnerState',
126
+ 'reportingOwnerAddress_rptOwnerStateDescription': 'rptOwnerStateDescription',
127
+ 'reportingOwnerAddress_rptOwnerStreet1': 'rptOwnerStreet1',
128
+ 'reportingOwnerAddress_rptOwnerStreet2': 'rptOwnerStreet2',
129
+ 'reportingOwnerAddress_rptOwnerZipCode': 'rptOwnerZipCode',
130
+ 'reportingOwnerId_rptOwnerCik': 'rptOwnerCik',
131
+ 'reportingOwnerId_rptOwnerName': 'rptOwnerName',
132
+ 'reportingOwnerRelationship_isDirector': 'rptOwnerIsDirector',
133
+ 'reportingOwnerRelationship_isOfficer': 'rptOwnerIsOfficer',
134
+ 'reportingOwnerRelationship_isTenPercentOwner': 'rptOwnerIsTenPercentOwner',
135
+ 'reportingOwnerRelationship_isOther': 'rptOwnerIsOther',
136
+ 'reportingOwnerRelationship_officerTitle': 'rptOwnerOfficerTitle',
137
+ 'reportingOwnerRelationship_otherText': 'rptOwnerOtherText'
138
+ }
139
+
140
+ # Metadata ownership mapping
141
+ metadata_ownership_dict = {
142
+ 'periodOfReport': 'periodOfReport',
143
+ 'issuer_issuerCik': 'issuerCik',
144
+ 'issuer_issuerName': 'issuerName',
145
+ 'issuer_issuerTradingSymbol': 'issuerTradingSymbol',
146
+ 'documentType': 'documentType',
147
+ 'remarks': 'remarks',
148
+ 'documentDescription': 'documentDescription',
149
+ 'footnotes': 'footnotes',
150
+ 'notSubjectToSection16': 'notSubjectToSection16',
151
+ 'form3HoldingsReported': 'form3HoldingsReported',
152
+ 'form4TransactionsReported': 'form4TransactionsReported',
153
+ 'noSecuritiesOwned': 'noSecuritiesOwned',
154
+ 'aff10b5One': 'aff10b5One',
155
+ 'dateOfOriginalSubmission': 'dateOfOriginalSubmission',
156
+ 'schemaVersion': 'schemaVersion'
157
+ }
158
+
159
+ # Owner signature ownership mapping
160
+ owner_signature_ownership_dict = {
161
+ 'signatureName': 'signatureName',
162
+ 'signatureDate': 'signatureDate'
163
+ }
164
+
165
+
166
+ mappings = {
167
+ 'ownership_non_derivative_transactions' : ownership_non_derivative_transactions_dict,
168
+ 'ownership_derivative_transactions' : derivative_transaction_ownership_dict,
169
+ 'ownership_non_derivative_holdings' : non_derivative_holding_ownership_dict,
170
+ 'ownership_derivative_holdings' : derivative_holding_ownership_dict,
171
+ 'ownership_reporting_owner' : reporting_owner_ownership_dict,
172
+ 'ownership_metadata' : metadata_ownership_dict,
173
+ 'ownership_owner_signature' : owner_signature_ownership_dict
174
+ }
datamule/portfolio.py CHANGED
@@ -127,7 +127,7 @@ class Portfolio:
127
127
  self.accession_numbers = new_accession_numbers
128
128
 
129
129
  def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
130
- requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True, **kwargs):
130
+ requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True,skip_existing=True, **kwargs):
131
131
  if provider is None:
132
132
  config = Config()
133
133
  provider = config.get_default_source()
@@ -135,6 +135,11 @@ class Portfolio:
135
135
  # Process CIK and metadata filters
136
136
  cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
137
137
 
138
+ accession_numbers = self.accession_numbers if hasattr(self, 'accession_numbers') else None
139
+ skip_accession_numbers = []
140
+ if skip_existing:
141
+ skip_accession_numbers = [sub.accession for sub in self]
142
+
138
143
  if provider == 'datamule':
139
144
 
140
145
  seclibrary_download(
@@ -143,10 +148,11 @@ class Portfolio:
143
148
  api_key=self.api_key,
144
149
  submission_type=submission_type,
145
150
  filing_date=filing_date,
146
- accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
151
+ accession_numbers=accession_numbers,
147
152
  keep_document_types=document_type,
148
153
  keep_filtered_metadata=keep_filtered_metadata,
149
154
  standardize_metadata=standardize_metadata,
155
+ skip_accession_numbers=skip_accession_numbers
150
156
  )
151
157
  else:
152
158
  sec_download(
@@ -155,10 +161,11 @@ class Portfolio:
155
161
  submission_type=submission_type,
156
162
  filing_date=filing_date,
157
163
  requests_per_second=requests_per_second,
158
- accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
164
+ accession_numbers=accession_numbers,
159
165
  keep_document_types=document_type,
160
166
  keep_filtered_metadata=keep_filtered_metadata,
161
167
  standardize_metadata=standardize_metadata,
168
+ skip_accession_numbers=skip_accession_numbers
162
169
  )
163
170
 
164
171
  self.submissions_loaded = False
@@ -5,7 +5,8 @@ from tqdm import tqdm
5
5
 
6
6
  def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
7
7
  requests_per_second=5, output_dir="filings", accession_numbers=None,
8
- quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
8
+ quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
9
+ skip_accession_numbers=[]):
9
10
  # Make sure output directory exists
10
11
  os.makedirs(output_dir, exist_ok=True)
11
12
 
@@ -29,5 +30,6 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
29
30
  requests_per_second=requests_per_second,
30
31
  document_callback=callback_wrapper,
31
32
  accession_numbers=accession_numbers,
33
+ skip_accession_numbers=skip_accession_numbers,
32
34
  quiet=quiet
33
35
  )
@@ -21,7 +21,7 @@ def fix_filing_url(url):
21
21
  return url
22
22
 
23
23
  class Streamer(EFTSQuery):
24
- def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None, quiet=False):
24
+ def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None,skip_accession_numbers=None, quiet=False):
25
25
  super().__init__(requests_per_second=requests_per_second, quiet=quiet)
26
26
  self.document_callback = document_callback
27
27
  self.document_queue = asyncio.Queue()
@@ -32,6 +32,7 @@ class Streamer(EFTSQuery):
32
32
  self.documents_processed = 0
33
33
  self.total_documents = 0
34
34
  self.accession_numbers = accession_numbers
35
+ self.skip_accession_numbers = skip_accession_numbers
35
36
  self.skipped_documents = 0
36
37
 
37
38
  async def _fetch_worker(self):
@@ -81,6 +82,9 @@ class Streamer(EFTSQuery):
81
82
  if self.accession_numbers is not None and accno_w_dash not in self.accession_numbers:
82
83
  return None, None, None
83
84
 
85
+ if self.skip_accession_numbers is not None and accno_w_dash in self.skip_accession_numbers:
86
+ return None, None, None
87
+
84
88
  # Construct the URL
85
89
  url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accno_no_dash}/{accno_w_dash}.txt"
86
90
  url = fix_filing_url(url)
@@ -218,7 +222,7 @@ class Streamer(EFTSQuery):
218
222
  return results
219
223
 
220
224
  def stream(cik=None, submission_type=None, filing_date=None, location=None,
221
- requests_per_second=5.0, document_callback=None, accession_numbers=None,
225
+ requests_per_second=5.0, document_callback=None, accession_numbers=None,skip_accession_numbers=[],
222
226
  quiet=False, name=None):
223
227
  """
224
228
  Stream EFTS results and download documents into memory.
@@ -257,6 +261,7 @@ def stream(cik=None, submission_type=None, filing_date=None, location=None,
257
261
  requests_per_second=requests_per_second,
258
262
  document_callback=document_callback,
259
263
  accession_numbers=accession_numbers,
264
+ skip_accession_numbers=skip_accession_numbers,
260
265
  quiet=quiet
261
266
  )
262
267
  return await streamer.stream(cik, submission_type, filing_date, location, name)
@@ -5,8 +5,6 @@ from ..utils import headers
5
5
  def fetch_frame(taxonomy, concept, unit, period):
6
6
  url = f"https://data.sec.gov/api/xbrl/frames/{taxonomy}/{concept}/{unit}/{period}.json"
7
7
  response = requests.get(url, headers=headers)
8
- print(url)
9
- print(response)
10
8
  return response.json()
11
9
 
12
10
 
@@ -14,7 +14,6 @@ from queue import Queue, Empty
14
14
  from threading import Thread
15
15
  from .query import query
16
16
  from os import cpu_count
17
- from ..submission import Submission
18
17
  from secsgml import write_sgml_file_to_tar
19
18
 
20
19
 
@@ -235,7 +234,8 @@ class Downloader:
235
234
  processor.stop_workers()
236
235
  decompression_pool.shutdown()
237
236
 
238
- def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
237
+ def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
238
+ skip_accession_numbers=[]):
239
239
  """
240
240
  Query SEC filings and download/process them.
241
241
 
@@ -259,10 +259,18 @@ class Downloader:
259
259
  filing_date=filing_date,
260
260
  api_key=self.api_key
261
261
  )
262
+
263
+
262
264
  # After querying but before generating URLs
263
265
  if accession_numbers:
266
+ accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
264
267
  filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]
265
268
 
269
+
270
+ if skip_accession_numbers:
271
+ skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
272
+ filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]
273
+
266
274
  # Generate URLs from the query results
267
275
 
268
276
  print(f"Generating URLs for {len(filings)} filings...")
@@ -355,7 +363,8 @@ class Downloader:
355
363
  print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
356
364
 
357
365
 
358
- def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
366
+ def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
367
+ skip_accession_numbers=[]):
359
368
  """
360
369
  Query SEC filings and download/process them.
361
370
 
@@ -383,28 +392,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
383
392
  accession_numbers=accession_numbers,
384
393
  keep_document_types=keep_document_types,
385
394
  keep_filtered_metadata=keep_filtered_metadata,
386
- standardize_metadata=standardize_metadata
395
+ standardize_metadata=standardize_metadata,
396
+ skip_accession_numbers=skip_accession_numbers
387
397
  )
388
-
389
- def download_files_using_filename(filenames, api_key=None, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
390
- """
391
- Download and process SEC filings using specific filenames.
392
-
393
- Parameters:
394
- - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
395
- - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
396
- - output_dir: Directory to save downloaded files
397
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
398
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
399
- - standardize_metadata: Whether to standardize metadata format
400
- """
401
- downloader = Downloader(api_key=api_key)
402
- downloader.QUEUE_SIZE = 1
403
- downloader.MAX_CONCURRENT_DOWNLOADS = 1
404
- downloader.download_files_using_filename(
405
- filenames=filenames,
406
- output_dir=output_dir,
407
- keep_document_types=keep_document_types,
408
- keep_filtered_metadata=keep_filtered_metadata,
409
- standardize_metadata=standardize_metadata
410
- )
datamule/submission.py CHANGED
@@ -251,7 +251,8 @@ class Submission:
251
251
  try:
252
252
  content = tar.extractfile(filename+'.zst').read()
253
253
  except:
254
- raise ValueError("Something went wrong with tar")
254
+ # some of these issues are on SEC data end, will fix when I setup cloud.
255
+ raise ValueError(f"Something went wrong with tar: {self.path}")
255
256
  # Decompress if compressed
256
257
  if filename.endswith('.gz'):
257
258
  content = gzip.decompress(content)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.5.8
3
+ Version: 1.6.0
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -15,6 +15,7 @@ Requires-Dist: selectolax
15
15
  Requires-Dist: pytz
16
16
  Requires-Dist: zstandard
17
17
  Requires-Dist: doc2dict
18
+ Requires-Dist: secxbrl
18
19
  Requires-Dist: secsgml
19
20
  Requires-Dist: websocket-client
20
21
 
@@ -3,14 +3,14 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
3
3
  datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
4
4
  datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
5
5
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
6
- datamule/portfolio.py,sha256=Ijx4JFRHSzPoGJRdOTv8c90x79M80LlAXUhUncwYZSo,7755
6
+ datamule/portfolio.py,sha256=eF1eDSwIg-CI8ZmZAHRjCGU0UhuPN4ijxPB0YDT4s2o,8023
7
7
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
8
- datamule/submission.py,sha256=6JIi-ayLL-jENVj6Q4IhmrYlAreJI7xBAHP_NYaDB6k,12918
8
+ datamule/submission.py,sha256=vAiYNas1YrWgm4Grw24peJbfSUVERySEko1zmdtG49s,13033
9
9
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
10
10
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
12
12
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- datamule/document/document.py,sha256=04Rivdphq0D1HEGIBjtl1LelJr-IyQU1qCMi8yNJajw,14038
13
+ datamule/document/document.py,sha256=YGo-Iz_sBXekUeKEAoNJV0BiLDtSOgD9OXFo2FocYq8,14439
14
14
  datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
15
15
  datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
16
16
  datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,6 +37,9 @@ datamule/document/mappings/ta.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
37
37
  datamule/document/mappings/thirteenfhr.py,sha256=XpYRIMPZnGLfEE4TqBI0BPXbyuq0xf3hut1fePOF6kU,4250
38
38
  datamule/document/mappings/twentyfivense.py,sha256=lKyj0ZBhkHX9gQJMTUPrQlxYFg3k-aBnWqtoS5bujZM,905
39
39
  datamule/document/mappings/twentyfourf2nt.py,sha256=Q7RPT3JgJHjYdjMuaSyAxclt6QPT_LgCQloxp-ByDuI,4118
40
+ datamule/document/mappings_new/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
+ datamule/document/mappings_new/mappings.py,sha256=sP94GK3-klMCTD6XFajAP9KxJ7Wq5YMMaXcHx1rQEKA,281
42
+ datamule/document/mappings_new/ownership.py,sha256=GVtyROefvEC_X5l6kayvZv57-kHxj8bHckAru8JtFOQ,10656
40
43
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
44
  datamule/mapping_dicts/html_mapping_dicts.py,sha256=G2PWB__FNg4VH9iFJFkflM0u-qOEtk67IWtGoqesb0k,5388
42
45
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
@@ -46,21 +49,21 @@ datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
46
49
  datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
50
  datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNsBw5Jv0Tx5aljiGUJkk7DRk,18745
48
51
  datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
- datamule/sec/submissions/downloader.py,sha256=tDWn8bsK9XabQo2pBGYSiqTw37MmqM8rEma8Ph7zp-o,1391
52
+ datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
50
53
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
51
54
  datamule/sec/submissions/monitor.py,sha256=ll0nfHzG8FI3bA8zVFrfsfZGnbt5qAD4rRZ4LG2SORY,9567
52
- datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
55
+ datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
53
56
  datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
54
57
  datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
58
  datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
56
- datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
59
+ datamule/sec/xbrl/filter_xbrl.py,sha256=QiSfm7tsJVLIw2PFqGh8D01qsRe_ZB-mbFhr6KcBa8A,1281
57
60
  datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H5N2tbvKUzM,3307
58
61
  datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
59
62
  datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
63
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
61
- datamule/seclibrary/downloader.py,sha256=wNRURTGb3eqg12Ltt4578L0WcAm7DmCWg0Rm0Om6Z4U,17959
64
+ datamule/seclibrary/downloader.py,sha256=ylv69VF22IVfrdeCkiGr5mVa2GKrPC9zFiDJU1fiBu8,17262
62
65
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
63
- datamule-1.5.8.dist-info/METADATA,sha256=kfV8_aDjqzk6OZKmJn4GIffpvTW-SYi55O1qSOEnsGQ,501
64
- datamule-1.5.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
65
- datamule-1.5.8.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
66
- datamule-1.5.8.dist-info/RECORD,,
66
+ datamule-1.6.0.dist-info/METADATA,sha256=E4F7MeBNWhHn19TH7eUyQN_vnONCvw-NiObNCRbsLE0,524
67
+ datamule-1.6.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
68
+ datamule-1.6.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
69
+ datamule-1.6.0.dist-info/RECORD,,