datamule 1.4.9__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-1.4.9 → datamule-1.5.0}/PKG-INFO +1 -1
- {datamule-1.4.9 → datamule-1.5.0}/datamule/portfolio.py +1 -1
- datamule-1.5.0/datamule/sec/submissions/downloader.py +32 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/downloader.py +8 -15
- datamule-1.5.0/datamule/submission.py +215 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-1.4.9 → datamule-1.5.0}/setup.py +1 -1
- datamule-1.4.9/datamule/sec/submissions/downloader.py +0 -64
- datamule-1.4.9/datamule/submission.py +0 -197
- {datamule-1.4.9 → datamule-1.5.0}/datamule/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/config.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/document.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/atsn.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/cfportal.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/d.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex102_abs.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex99a_sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex99c_sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex99g_sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex99i_sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/information_table.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/nmfp.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/npx.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/onefourtyfour.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ownership.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/proxy_voting_record.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/sbs.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/sbsef.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/schedule13.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/submission_metadata.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ta.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/thirteenfhr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/twentyfivense.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/twentyfourf2nt.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/processing.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/table.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/helper.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/index.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/package_updater.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/utils.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/bq.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/query.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sheet.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/requires.txt +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/setup.cfg +0 -0
{datamule-1.4.9 → datamule-1.5.0}/datamule/portfolio.py

@@ -125,7 +125,7 @@ class Portfolio:
             # First query, just set the accession numbers
             self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
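For context, a hedged usage sketch of the updated method follows; the Portfolio constructor argument and the date-range form of filing_date are assumptions, not shown in this diff:

# Hypothetical usage of Portfolio.download_submissions as of 1.5.0.
# The constructor argument and the filing_date format below are assumptions.
from datamule import Portfolio

portfolio = Portfolio("apple_filings")            # assumed: an output directory
portfolio.download_submissions(
    ticker="AAPL",                                # or cik=...
    submission_type="10-K",
    filing_date=("2023-01-01", "2024-01-01"),     # assumed date-range form
    document_type=["10-K"],                       # default is now [] (keep all documents)
    requests_per_second=5,                        # new rate-limit parameter in 1.5.0
)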
datamule-1.5.0/datamule/sec/submissions/downloader.py (new file)

@@ -0,0 +1,32 @@
+import os
+from .streamer import stream
+from secsgml import write_sgml_file_to_tar
+from tqdm import tqdm
+
+def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             quiet=False, keep_document_types=[]):
+    # Make sure output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
+    pbar = tqdm(desc="Writing", unit=" submissions", disable=quiet,position=2)
+
+    # Create a wrapper for the download_callback that includes the output_dir
+    async def callback_wrapper(hit, content, cik, accno, url):
+        output_path = os.path.join(output_dir, accno.replace('-','') + '.tar')
+        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types)
+        pbar.update(1)
+
+
+    # Call the stream function with our callback
+    return stream(
+        cik=cik,
+        name=name,
+        submission_type=submission_type,
+        filing_date=filing_date,
+        location=location,
+        requests_per_second=requests_per_second,
+        document_callback=callback_wrapper,
+        accession_numbers=accession_numbers,
+        quiet=quiet
+    )
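A minimal sketch of how this new function might be called; the import path simply mirrors the file location above, and the date-range form of filing_date is an assumption:

# Hypothetical call to the rewritten SEC downloader (datamule 1.5.0).
from datamule.sec.submissions.downloader import download  # path mirrors the new file above

download(
    cik="320193",
    submission_type="10-K",
    filing_date=("2023-01-01", "2024-01-01"),   # assumed date-range form
    output_dir="filings",
    requests_per_second=5,
    keep_document_types=["10-K"],               # forwarded to write_sgml_file_to_tar as filter_document_types
)
# Each hit is now written as a single <accession-number-without-dashes>.tar
# in output_dir instead of an extracted folder per submission.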
{datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/downloader.py

@@ -15,6 +15,7 @@ from threading import Thread
 from .query import query
 from os import cpu_count
 from ..submission import Submission
+from secsgml import write_sgml_file_to_tar
 
 
 
@@ -73,7 +74,7 @@ class Downloader:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
 class FileProcessor:
-    def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=
+    def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[]):
         self.processing_queue = Queue(maxsize=queue_size)
         self.should_stop = False
         self.processing_workers = []
@@ -93,17 +94,9 @@ class Downloader:
 
     def _process_file(self, item):
         filename, content = item
-
-
-
-        self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
-        self.pbar.update(1)
-        except Exception as e:
-            print(f"Exception {e} in {filename}")
-            accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
-            if os.path.exists(accession_dir):
-                shutil.rmtree(accession_dir)
-            self.downloader._log_error(self.output_dir, filename, str(e))
+        output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
+        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types)
+        self.pbar.update(1)
 
     def _processing_worker(self):
         batch = []
@@ -211,7 +204,7 @@ class Downloader:
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir, keep_document_types=
+    async def process_batch(self, urls, output_dir, keep_document_types=[]):
         os.makedirs(output_dir, exist_ok=True)
 
         with tqdm(total=len(urls), desc="Processing files") as pbar:
@@ -238,7 +231,7 @@ class Downloader:
         processor.stop_workers()
         decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
         """
         Query SEC filings and download/process them.
 
@@ -299,7 +292,7 @@ class Downloader:
        self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
    """
    Query SEC filings and download/process them.
 
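The net effect of these hunks is that downloaded submissions are written straight to .tar archives via secsgml.write_sgml_file_to_tar rather than saved as per-accession folders. A hedged sketch of the module-level entry point, using the signature shown above with an assumed import path and a placeholder API key:

# Hypothetical call to the seclibrary downloader (signature taken from the diff above).
from datamule.seclibrary.downloader import download  # assumed import path

download(
    submission_type="8-K",
    cik="320193",
    filing_date=("2024-01-01", "2024-06-30"),   # assumed date-range form
    api_key="YOUR_API_KEY",                     # placeholder
    output_dir="downloads",
    keep_document_types=[],                     # new default: keep every document type
)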
datamule-1.5.0/datamule/submission.py (new file)

@@ -0,0 +1,215 @@
+from pathlib import Path
+import json
+from .document.document import Document
+from secsgml import parse_sgml_content_into_memory
+import tarfile
+import shutil
+import zstandard as zstd
+from io import BytesIO
+import gzip
+
+class Submission:
+    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
+        if path is None and sgml_content is None:
+            raise ValueError("Either path or sgml_content must be provided")
+        if path is not None and sgml_content is not None:
+            raise ValueError("Only one of path or sgml_content must be provided")
+
+        if sgml_content is not None:
+            self.path = None
+            metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
+            # code dupe
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+            self.documents = []
+            filtered_metadata_documents = []
+
+            for idx,doc in enumerate(self.metadata.content['documents']):
+                type = doc.get('type')()
+
+                # Keep only specified types
+                if keep_document_types is not None and type not in keep_document_types:
+                    continue
+
+                # write as txt if not declared
+                filename = doc.get('filename','.txt')
+                extension = Path(filename).suffix
+                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
+
+                filtered_metadata_documents.append(doc)
+
+            self.metadata.content['documents'] = filtered_metadata_documents
+
+        if path is not None:
+            self.path = Path(path)
+            if self.path.suffix == '.tar':
+                with tarfile.open(self.path,'r') as tar:
+                    metadata_obj = tar.extractfile('metadata.json')
+                    metadata = json.loads(metadata_obj.read().decode('utf-8'))
+
+                # tarpath
+                metadata_path = f"{self.path}::metadata.json"
+            else:
+                metadata_path = self.path / 'metadata.json'
+                with metadata_path.open('r') as f:
+                    metadata = json.load(f)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+
+
+    def compress(self, compression=None, level=None, threshold=1048576):
+        if self.path is None:
+            raise ValueError("Compress requires path")
+
+        if compression is not None and compression not in ['gzip', 'zstd']:
+            raise ValueError("compression must be 'gzip' or 'zstd'")
+
+        # Create tar file (replace directory with .tar file)
+        tar_path = self.path.with_suffix('.tar')
+
+        with tarfile.open(tar_path, 'w') as tar:
+            # Add metadata.json first
+            metadata_path = self.path / 'metadata.json'
+            if metadata_path.exists():
+                tar.add(metadata_path, arcname='metadata.json')
+
+            # Add documents in order
+            for doc in self.metadata.content['documents']:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = doc['sequence'] + '.txt'
+
+                file_path = self.path / filename
+                if file_path.exists():
+                    file_size = file_path.stat().st_size
+
+
+                    # Compress if compression specified and over threshold
+                    if compression is not None and file_size >= threshold:
+                        content = file_path.read_bytes()
+
+                        if compression == 'gzip':
+                            compressed_content = gzip.compress(content, compresslevel=level or 6)
+                            compressed_filename = filename + '.gz'
+                        else: # zstd
+                            cctx = zstd.ZstdCompressor(level=level or 3)
+                            compressed_content = cctx.compress(content)
+                            compressed_filename = filename + '.zst'
+
+                        # Add compressed file to tar
+                        tarinfo = tarfile.TarInfo(name=compressed_filename)
+                        tarinfo.size = len(compressed_content)
+                        tar.addfile(tarinfo, BytesIO(compressed_content))
+                    else:
+                        # Add uncompressed file
+                        tar.add(file_path, arcname=filename)
+
+        # Delete original folder
+        shutil.rmtree(self.path)
+
+        # Update path to point to new tar file
+        self.path = tar_path
+
+    def decompress(self):
+        if self.path is None:
+            raise ValueError("Decompress requires path")
+        elif self.path.suffix != '.tar':
+            raise ValueError("Can only decompress tar")
+
+        # Create output directory (path without .tar extension)
+        output_dir = self.path.with_suffix('')
+        output_dir.mkdir(exist_ok=True)
+
+        with tarfile.open(self.path, 'r') as tar:
+            for member in tar.getmembers():
+                if member.isfile():
+                    content = tar.extractfile(member).read()
+
+                    # Decompress if gzipped
+                    if member.name.endswith('.gz'):
+                        content = gzip.decompress(content)
+                        output_path = output_dir / member.name[:-3]  # Remove .gz extension
+                    else:
+                        output_path = output_dir / member.name
+
+                    # Write to output directory
+                    output_path.parent.mkdir(parents=True, exist_ok=True)
+                    with output_path.open('wb') as f:
+                        f.write(content)
+
+        # delete original file
+        self.path.unlink()
+        self.path = output_dir
+
+    def _load_document_by_index(self, idx):
+        """Load a document by its index in the metadata documents list."""
+        doc = self.metadata.content['documents'][idx]
+
+        # If loaded from sgml_content, return pre-loaded document
+        if self.path is None:
+            return self.documents[idx]
+
+        # If loaded from path, load document on-demand
+        filename = doc.get('filename')
+        if filename is None:
+            filename = doc['sequence'] + '.txt'
+
+        document_path = self.path / filename
+        extension = document_path.suffix
+
+        if self.path.suffix == '.tar':
+            with tarfile.open(self.path, 'r') as tar:
+                # bandaid fix TODO
+                try:
+                    content = tar.extractfile(filename).read()
+                except:
+                    try:
+                        content = tar.extractfile(filename+'.gz').read()
+                    except:
+                        try:
+                            content = tar.extractfile(filename+'.zst').read()
+                        except:
+                            raise ValueError("Something went wrong with tar")
+                # Decompress if compressed
+                if filename.endswith('.gz'):
+                    content = gzip.decompress(content)
+                elif filename.endswith('.zst'):
+                    dctx = zstd.ZstdDecompressor()
+                    content = dctx.decompress(content)
+        else:
+            with document_path.open('rb') as f:
+                content = f.read()
+
+        # Decode text files
+        if extension in ['.htm', '.html', '.txt', '.xml']:
+            content = content.decode('utf-8', errors='replace')
+
+        return Document(
+            type=doc['type'],
+            content=content,
+            extension=extension,
+            filing_date=self.filing_date,
+            accession=self.accession,
+            path=document_path
+        )
+
+    def __iter__(self):
+        """Make Submission iterable by yielding all documents."""
+        for idx in range(len(self.metadata.content['documents'])):
+            yield self._load_document_by_index(idx)
+
+    def document_type(self, document_type):
+        """Yield documents matching the specified type(s)."""
+        # Convert single document type to list for consistent handling
+        if isinstance(document_type, str):
+            document_types = [document_type]
+        else:
+            document_types = [item for item in document_type]
+
+        for idx, doc in enumerate(self.metadata.content['documents']):
+            if doc['type'] in document_types:
+                yield self._load_document_by_index(idx)
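A short, hedged sketch of the new Submission API based on the code above; the example paths are hypothetical and the Document attribute names (type, extension, accession) are assumed from the constructor calls shown in the diff:

# Hypothetical usage of datamule 1.5.0's Submission class.
from datamule.submission import Submission

# Load from a tar archive (or a directory, or raw SGML bytes via sgml_content=...).
sub = Submission(path="filings/000032019324000123.tar")   # hypothetical path

# Iterate every document, or only selected types.
for doc in sub:
    print(doc.type, doc.extension)            # attribute names assumed from Document(...)

for doc in sub.document_type(["10-K", "EX-10.1"]):
    print(doc.accession)

# Directory-backed submissions can be packed into a tar, compressing members
# at or above the 1 MiB default threshold, and unpacked again later.
sub_dir = Submission(path="filings/000032019324000123")   # hypothetical directory
sub_dir.compress(compression="zstd", level=3)             # folder is replaced by a .tar
sub_dir.decompress()                                       # .tar is expanded back into a folder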
{datamule-1.4.9 → datamule-1.5.0}/setup.py

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.4.9",
+    version="1.5.0",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
datamule-1.4.9/datamule/sec/submissions/downloader.py (deleted)

@@ -1,64 +0,0 @@
-import os
-import json
-from .streamer import stream
-import aiofiles
-from ...submission import Submission
-
-async def download_callback(hit, content, cik, accno, url, output_dir="filings", keep_document_types=None):
-    """Save downloaded SEC submission to disk."""
-    try:
-        # Create a Submission object directly from the content
-        # Note: the content needs to be decoded from bytes to string for the parser
-        submission = Submission(sgml_content=content,
-                                keep_document_types=keep_document_types)
-
-        # Use the async save method to write the submission to disk
-        file_dir = await submission.save_async(output_dir=output_dir)
-
-        return file_dir
-    except Exception as e:
-        print(f"Error processing {accno}: {e}")
-        return None
-
-def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-             requests_per_second=5, output_dir="filings", accession_numbers=None,
-             quiet=False, keep_document_types=None):
-    """
-    Download SEC EDGAR filings and extract their documents.
-
-    Parameters:
-    - cik: CIK number(s) to query for
-    - submission_type: Filing type(s) to query for (default: 10-K)
-    - filing_date: Date or date range to query for
-    - location: Location code to filter by (e.g., 'CA' for California)
-    - name: Company name to search for (alternative to providing CIK)
-    - requests_per_second: Rate limit for SEC requests
-    - output_dir: Directory to save documents
-    - accession_numbers: Optional list of accession numbers to filter by
-    - quiet: Whether to suppress progress output
-    - keep_document_types: Optional list of document types to keep (e.g. ['10-K', 'EX-10.1'])
-
-    Returns:
-    - List of all document paths processed
-    """
-    # Make sure output directory exists
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Create a wrapper for the download_callback that includes the output_dir
-    async def callback_wrapper(hit, content, cik, accno, url):
-        return await download_callback(hit, content, cik, accno, url,
-                                       output_dir=output_dir,
-                                       keep_document_types=keep_document_types)
-
-    # Call the stream function with our callback
-    return stream(
-        cik=cik,
-        name=name,
-        submission_type=submission_type,
-        filing_date=filing_date,
-        location=location,
-        requests_per_second=requests_per_second,
-        document_callback=callback_wrapper,
-        accession_numbers=accession_numbers,
-        quiet=quiet
-    )
datamule-1.4.9/datamule/submission.py (deleted)

@@ -1,197 +0,0 @@
-from pathlib import Path
-import json
-from .document.document import Document
-from secsgml import parse_sgml_content_into_memory
-import os
-import aiofiles
-
-# TODO add .tar path
-class Submission:
-    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
-        if path is None and sgml_content is None:
-            raise ValueError("Either path or sgml_content must be provided")
-        if path is not None and sgml_content is not None:
-            raise ValueError("Only one of path or sgml_content must be provided")
-
-        if sgml_content is not None:
-            self.path = None
-            metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
-            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-            # code dupe
-            self.accession = self.metadata.content['accession-number']
-            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-            self.documents = []
-            filtered_metadata_documents = []
-
-            for idx,doc in enumerate(self.metadata.content['documents']):
-                type = doc.get('type').upper()
-
-                # Keep only specified types
-                if keep_document_types is not None and type not in keep_document_types:
-                    continue
-
-                # write as txt if not declared
-                filename = doc.get('filename','.txt')
-                extension = Path(filename).suffix
-                self.documents.append(Document(type=type.upper(), content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
-
-                filtered_metadata_documents.append(doc)
-
-            self.metadata.content['documents'] = filtered_metadata_documents
-
-        if path is not None:
-            self.path = Path(path)
-            metadata_path = self.path / 'metadata.json'
-            with metadata_path.open('r') as f:
-                metadata = json.load(f)
-            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
-            # Code dupe
-            self.accession = self.metadata.content['accession-number']
-            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-
-    def document_type(self, document_type):
-        # Convert single document type to list for consistent handling
-        if isinstance(document_type, str):
-            document_types = [document_type.lower()]
-        else:
-            document_types = [item.lower() for item in document_type]
-
-        for idx,doc in enumerate(self.metadata.content['documents']):
-            if doc['type'] in document_types:
-
-                # if loaded from path
-                if self.path is not None:
-                    filename = doc.get('filename')
-                    # oh we need handling here for sequences case
-                    if filename is None:
-                        filename = doc['sequence'] + '.txt'
-
-                    document_path = self.path / filename
-                    extension = document_path.suffix
-
-                    with document_path.open('rb') as f:
-                        content = f.read()
-
-                    if extension in ['.htm','.html','.txt','.xml']:
-                        content = content.decode('utf-8', errors='replace')
-
-                    yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
-                # if loaded from sgml_content
-                else:
-                    yield self.documents[idx]
-
-
-    def __iter__(self):
-        for idx,doc in enumerate(self.metadata.content['documents']):
-            # if loaded from path
-            if self.path is not None:
-                filename = doc.get('filename')
-
-                # oh we need handling here for sequences case
-                if filename is None:
-                    filename = doc['sequence'] + '.txt'
-
-                document_path = self.path / filename
-                extension = document_path.suffix
-
-                # check if the file exists
-                if document_path.exists():
-                    with document_path.open('rb') as f:
-                        content = f.read()
-
-                    if extension in ['.htm','.html','.txt','.xml']:
-                        content = content.decode('utf-8', errors='replace')
-
-                    yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
-                else:
-                    print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
-
-            # if loaded from sgml_content
-            else:
-                yield self.documents[idx]
-
-
-
-
-    def save(self, output_dir="filings"):
-        file_dir = Path(output_dir) / str(self.accession)
-        file_dir.mkdir(parents=True, exist_ok=True)
-
-        metadata_path = file_dir / "metadata.json"
-        with open(metadata_path, 'w') as f:
-            json.dump(self.metadata.content, f, indent=4)
-
-        for idx, doc in enumerate(self.metadata.content['documents']):
-            filename = doc.get('filename')
-            if filename is None:
-                filename = f"{doc.get('sequence')}.txt"
-
-            doc_path = file_dir / filename
-
-            if self.path is not None:
-                if hasattr(self, 'documents') and self.documents:
-                    content = self.documents[idx].content
-                else:
-                    orig_doc_path = self.path / filename
-                    if orig_doc_path.exists():
-                        with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
-                            content = f.read()
-                    else:
-                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
-                        continue
-            else:
-                content = self.documents[idx].content
-
-            if isinstance(content, bytes):
-                with open(doc_path, 'wb') as f:
-                    f.write(content)
-            else:
-                with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
-                    f.write(content)
-
-        return file_dir
-
-    async def save_async(self, output_dir="filings"):
-        file_dir = Path(output_dir) / str(self.accession)
-        os.makedirs(file_dir, exist_ok=True)
-
-        metadata_path = file_dir / "metadata.json"
-        async with aiofiles.open(metadata_path, 'w') as f:
-            await f.write(json.dumps(self.metadata.content, indent=4))
-
-        for idx, doc in enumerate(self.metadata.content['documents']):
-            filename = doc.get('filename')
-            # oh we need handling here for sequences case
-            if filename is None:
-                filename = doc['sequence'] + '.txt'
-
-
-            doc_path = file_dir / filename
-
-            if self.path is not None:
-                if hasattr(self, 'documents') and self.documents:
-                    content = self.documents[idx].content
-                else:
-                    orig_doc_path = self.path / filename
-                    if orig_doc_path.exists():
-                        async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
-                            content = await f.read()
-                    else:
-                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
-                        continue
-            else:
-                content = self.documents[idx].content
-
-            if isinstance(content, bytes):
-                async with aiofiles.open(doc_path, 'wb') as f:
-                    await f.write(content)
-            else:
-                async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
-                    await f.write(content)
-
-        return file_dir