datamule 1.4.6__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {datamule-1.4.6 → datamule-1.5.0}/PKG-INFO +1 -1
  2. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/document.py +3 -14
  3. {datamule-1.4.6 → datamule-1.5.0}/datamule/portfolio.py +1 -1
  4. datamule-1.5.0/datamule/sec/submissions/downloader.py +32 -0
  5. {datamule-1.4.6 → datamule-1.5.0}/datamule/seclibrary/downloader.py +8 -15
  6. datamule-1.5.0/datamule/submission.py +215 -0
  7. {datamule-1.4.6 → datamule-1.5.0}/datamule.egg-info/PKG-INFO +1 -1
  8. {datamule-1.4.6 → datamule-1.5.0}/setup.py +1 -1
  9. datamule-1.4.6/datamule/sec/submissions/downloader.py +0 -64
  10. datamule-1.4.6/datamule/submission.py +0 -261
  11. {datamule-1.4.6 → datamule-1.5.0}/datamule/__init__.py +0 -0
  12. {datamule-1.4.6 → datamule-1.5.0}/datamule/config.py +0 -0
  13. {datamule-1.4.6 → datamule-1.5.0}/datamule/data/listed_filer_metadata.csv +0 -0
  14. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/__init__.py +0 -0
  15. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/__init__.py +0 -0
  16. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/atsn.py +0 -0
  17. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/cfportal.py +0 -0
  18. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/d.py +0 -0
  19. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/ex102_abs.py +0 -0
  20. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/ex99a_sdr.py +0 -0
  21. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/ex99c_sdr.py +0 -0
  22. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/ex99g_sdr.py +0 -0
  23. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/ex99i_sdr.py +0 -0
  24. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/information_table.py +0 -0
  25. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/nmfp.py +0 -0
  26. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/npx.py +0 -0
  27. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/onefourtyfour.py +0 -0
  28. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/ownership.py +0 -0
  29. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/proxy_voting_record.py +0 -0
  30. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/sbs.py +0 -0
  31. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/sbsef.py +0 -0
  32. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/schedule13.py +0 -0
  33. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/sdr.py +0 -0
  34. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/submission_metadata.py +0 -0
  35. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/ta.py +0 -0
  36. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/thirteenfhr.py +0 -0
  37. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/twentyfivense.py +0 -0
  38. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  39. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/processing.py +0 -0
  40. {datamule-1.4.6 → datamule-1.5.0}/datamule/document/table.py +0 -0
  41. {datamule-1.4.6 → datamule-1.5.0}/datamule/helper.py +0 -0
  42. {datamule-1.4.6 → datamule-1.5.0}/datamule/index.py +0 -0
  43. {datamule-1.4.6 → datamule-1.5.0}/datamule/mapping_dicts/__init__.py +0 -0
  44. {datamule-1.4.6 → datamule-1.5.0}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  45. {datamule-1.4.6 → datamule-1.5.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  46. {datamule-1.4.6 → datamule-1.5.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  47. {datamule-1.4.6 → datamule-1.5.0}/datamule/package_updater.py +0 -0
  48. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/__init__.py +0 -0
  49. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/infrastructure/__init__.py +0 -0
  50. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  51. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/submissions/__init__.py +0 -0
  52. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/submissions/eftsquery.py +0 -0
  53. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/submissions/monitor.py +0 -0
  54. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/submissions/streamer.py +0 -0
  55. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/submissions/textsearch.py +0 -0
  56. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/utils.py +0 -0
  57. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/xbrl/__init__.py +0 -0
  58. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  59. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  60. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  61. {datamule-1.4.6 → datamule-1.5.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  62. {datamule-1.4.6 → datamule-1.5.0}/datamule/seclibrary/__init__.py +0 -0
  63. {datamule-1.4.6 → datamule-1.5.0}/datamule/seclibrary/bq.py +0 -0
  64. {datamule-1.4.6 → datamule-1.5.0}/datamule/seclibrary/query.py +0 -0
  65. {datamule-1.4.6 → datamule-1.5.0}/datamule/sheet.py +0 -0
  66. {datamule-1.4.6 → datamule-1.5.0}/datamule.egg-info/SOURCES.txt +0 -0
  67. {datamule-1.4.6 → datamule-1.5.0}/datamule.egg-info/dependency_links.txt +0 -0
  68. {datamule-1.4.6 → datamule-1.5.0}/datamule.egg-info/requires.txt +0 -0
  69. {datamule-1.4.6 → datamule-1.5.0}/datamule.egg-info/top_level.txt +0 -0
  70. {datamule-1.4.6 → datamule-1.5.0}/setup.cfg +0 -0
{datamule-1.4.6 → datamule-1.5.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.4.6
+Version: 1.5.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-1.4.6 → datamule-1.5.0}/datamule/document/document.py
@@ -11,19 +11,7 @@ from selectolax.parser import HTMLParser
 from .processing import process_tabular_data
 from pathlib import Path
 import webbrowser
-
-def convert_bytes_keys(obj):
-    if isinstance(obj, dict):
-        return {
-            (k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
-            for k, v in obj.items()
-        }
-    elif isinstance(obj, list):
-        return [convert_bytes_keys(item) for item in obj]
-    elif isinstance(obj, bytes):
-        return obj.decode('utf-8').lower()
-    else:
-        return obj
+from secsgml.utils import bytes_to_str
 
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -34,7 +22,8 @@ class Document:
         self.filing_date = filing_date
 
         if self.type == 'submission_metadata':
-            self.content = convert_bytes_keys(content)
+            # this converts to lower
+            self.content = bytes_to_str(content)
         else:
             self.content = content
 
{datamule-1.4.6 → datamule-1.5.0}/datamule/portfolio.py
@@ -125,7 +125,7 @@ class Portfolio:
             # First query, just set the accession numbers
             self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=None,requests_per_second=5, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
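Note: the only change above is that document_type now defaults to an empty list instead of None. A minimal usage sketch (the Portfolio constructor argument, ticker, and form type below are illustrative assumptions, not taken from this diff):

    from datamule import Portfolio

    # Hypothetical example: download 10-K submissions into a local portfolio
    # directory, keeping only the primary 10-K document from each filing.
    portfolio = Portfolio("apple_filings")
    portfolio.download_submissions(
        ticker="AAPL",
        submission_type="10-K",
        document_type=["10-K"],
    )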
datamule-1.5.0/datamule/sec/submissions/downloader.py
@@ -0,0 +1,32 @@
+import os
+from .streamer import stream
+from secsgml import write_sgml_file_to_tar
+from tqdm import tqdm
+
+def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             quiet=False, keep_document_types=[]):
+    # Make sure output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
+    pbar = tqdm(desc="Writing", unit=" submissions", disable=quiet,position=2)
+
+    # Create a wrapper for the download_callback that includes the output_dir
+    async def callback_wrapper(hit, content, cik, accno, url):
+        output_path = os.path.join(output_dir, accno.replace('-','') + '.tar')
+        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types)
+        pbar.update(1)
+
+
+    # Call the stream function with our callback
+    return stream(
+        cik=cik,
+        name=name,
+        submission_type=submission_type,
+        filing_date=filing_date,
+        location=location,
+        requests_per_second=requests_per_second,
+        document_callback=callback_wrapper,
+        accession_numbers=accession_numbers,
+        quiet=quiet
+    )
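The new downloader writes each submission straight to an <accession>.tar archive via secsgml.write_sgml_file_to_tar rather than materializing a Submission object and saving it file by file. A minimal usage sketch (the CIK, form type, and filter values are illustrative assumptions):

    from datamule.sec.submissions.downloader import download

    # Hypothetical example: stream 10-K filings for one CIK and write each
    # submission as <accession>.tar under ./filings.
    download(
        cik="320193",
        submission_type="10-K",
        output_dir="filings",
        keep_document_types=["10-K"],
        requests_per_second=5,
    )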
{datamule-1.4.6 → datamule-1.5.0}/datamule/seclibrary/downloader.py
@@ -15,6 +15,7 @@ from threading import Thread
 from .query import query
 from os import cpu_count
 from ..submission import Submission
+from secsgml import write_sgml_file_to_tar
 
 
 
@@ -73,7 +74,7 @@ class Downloader:
            print(f"Failed to log error to {error_file}: {str(e)}")
 
    class FileProcessor:
-        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=None):
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[]):
            self.processing_queue = Queue(maxsize=queue_size)
            self.should_stop = False
            self.processing_workers = []
@@ -93,17 +94,9 @@
 
        def _process_file(self, item):
            filename, content = item
-            try:
-                submission = Submission(sgml_content=content, keep_document_types=self.keep_document_types)
-                # Use the shared event loop to run save_async
-                self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
-                self.pbar.update(1)
-            except Exception as e:
-                print(f"Exception {e} in {filename}")
-                accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
-                if os.path.exists(accession_dir):
-                    shutil.rmtree(accession_dir)
-                self.downloader._log_error(self.output_dir, filename, str(e))
+            output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
+            write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types)
+            self.pbar.update(1)
 
        def _processing_worker(self):
            batch = []
@@ -211,7 +204,7 @@
        except Exception as e:
            self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir, keep_document_types=None):
+    async def process_batch(self, urls, output_dir, keep_document_types=[]):
        os.makedirs(output_dir, exist_ok=True)
 
        with tqdm(total=len(urls), desc="Processing files") as pbar:
@@ -238,7 +231,7 @@
            processor.stop_workers()
            decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
        """
        Query SEC filings and download/process them.
 
@@ -299,7 +292,7 @@
        self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
    """
    Query SEC filings and download/process them.
 
datamule-1.5.0/datamule/submission.py
@@ -0,0 +1,215 @@
+from pathlib import Path
+import json
+from .document.document import Document
+from secsgml import parse_sgml_content_into_memory
+import tarfile
+import shutil
+import zstandard as zstd
+from io import BytesIO
+import gzip
+
+class Submission:
+    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
+        if path is None and sgml_content is None:
+            raise ValueError("Either path or sgml_content must be provided")
+        if path is not None and sgml_content is not None:
+            raise ValueError("Only one of path or sgml_content must be provided")
+
+        if sgml_content is not None:
+            self.path = None
+            metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
+            # code dupe
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+            self.documents = []
+            filtered_metadata_documents = []
+
+            for idx,doc in enumerate(self.metadata.content['documents']):
+                type = doc.get('type')()
+
+                # Keep only specified types
+                if keep_document_types is not None and type not in keep_document_types:
+                    continue
+
+                # write as txt if not declared
+                filename = doc.get('filename','.txt')
+                extension = Path(filename).suffix
+                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
+
+                filtered_metadata_documents.append(doc)
+
+            self.metadata.content['documents'] = filtered_metadata_documents
+
+        if path is not None:
+            self.path = Path(path)
+            if self.path.suffix == '.tar':
+                with tarfile.open(self.path,'r') as tar:
+                    metadata_obj = tar.extractfile('metadata.json')
+                    metadata = json.loads(metadata_obj.read().decode('utf-8'))
+
+                # tarpath
+                metadata_path = f"{self.path}::metadata.json"
+            else:
+                metadata_path = self.path / 'metadata.json'
+                with metadata_path.open('r') as f:
+                    metadata = json.load(f)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+
+
+    def compress(self, compression=None, level=None, threshold=1048576):
+        if self.path is None:
+            raise ValueError("Compress requires path")
+
+        if compression is not None and compression not in ['gzip', 'zstd']:
+            raise ValueError("compression must be 'gzip' or 'zstd'")
+
+        # Create tar file (replace directory with .tar file)
+        tar_path = self.path.with_suffix('.tar')
+
+        with tarfile.open(tar_path, 'w') as tar:
+            # Add metadata.json first
+            metadata_path = self.path / 'metadata.json'
+            if metadata_path.exists():
+                tar.add(metadata_path, arcname='metadata.json')
+
+            # Add documents in order
+            for doc in self.metadata.content['documents']:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = doc['sequence'] + '.txt'
+
+                file_path = self.path / filename
+                if file_path.exists():
+                    file_size = file_path.stat().st_size
+
+
+                    # Compress if compression specified and over threshold
+                    if compression is not None and file_size >= threshold:
+                        content = file_path.read_bytes()
+
+                        if compression == 'gzip':
+                            compressed_content = gzip.compress(content, compresslevel=level or 6)
+                            compressed_filename = filename + '.gz'
+                        else:  # zstd
+                            cctx = zstd.ZstdCompressor(level=level or 3)
+                            compressed_content = cctx.compress(content)
+                            compressed_filename = filename + '.zst'
+
+                        # Add compressed file to tar
+                        tarinfo = tarfile.TarInfo(name=compressed_filename)
+                        tarinfo.size = len(compressed_content)
+                        tar.addfile(tarinfo, BytesIO(compressed_content))
+                    else:
+                        # Add uncompressed file
+                        tar.add(file_path, arcname=filename)
+
+        # Delete original folder
+        shutil.rmtree(self.path)
+
+        # Update path to point to new tar file
+        self.path = tar_path
+
+    def decompress(self):
+        if self.path is None:
+            raise ValueError("Decompress requires path")
+        elif self.path.suffix != '.tar':
+            raise ValueError("Can only decompress tar")
+
+        # Create output directory (path without .tar extension)
+        output_dir = self.path.with_suffix('')
+        output_dir.mkdir(exist_ok=True)
+
+        with tarfile.open(self.path, 'r') as tar:
+            for member in tar.getmembers():
+                if member.isfile():
+                    content = tar.extractfile(member).read()
+
+                    # Decompress if gzipped
+                    if member.name.endswith('.gz'):
+                        content = gzip.decompress(content)
+                        output_path = output_dir / member.name[:-3]  # Remove .gz extension
+                    else:
+                        output_path = output_dir / member.name
+
+                    # Write to output directory
+                    output_path.parent.mkdir(parents=True, exist_ok=True)
+                    with output_path.open('wb') as f:
+                        f.write(content)
+
+        # delete original file
+        self.path.unlink()
+        self.path = output_dir
+
+    def _load_document_by_index(self, idx):
+        """Load a document by its index in the metadata documents list."""
+        doc = self.metadata.content['documents'][idx]
+
+        # If loaded from sgml_content, return pre-loaded document
+        if self.path is None:
+            return self.documents[idx]
+
+        # If loaded from path, load document on-demand
+        filename = doc.get('filename')
+        if filename is None:
+            filename = doc['sequence'] + '.txt'
+
+        document_path = self.path / filename
+        extension = document_path.suffix
+
+        if self.path.suffix == '.tar':
+            with tarfile.open(self.path, 'r') as tar:
+                # bandaid fix TODO
+                try:
+                    content = tar.extractfile(filename).read()
+                except:
+                    try:
+                        content = tar.extractfile(filename+'.gz').read()
+                    except:
+                        try:
+                            content = tar.extractfile(filename+'.zst').read()
+                        except:
+                            raise ValueError("Something went wrong with tar")
+            # Decompress if compressed
+            if filename.endswith('.gz'):
+                content = gzip.decompress(content)
+            elif filename.endswith('.zst'):
+                dctx = zstd.ZstdDecompressor()
+                content = dctx.decompress(content)
+        else:
+            with document_path.open('rb') as f:
+                content = f.read()
+
+        # Decode text files
+        if extension in ['.htm', '.html', '.txt', '.xml']:
+            content = content.decode('utf-8', errors='replace')
+
+        return Document(
+            type=doc['type'],
+            content=content,
+            extension=extension,
+            filing_date=self.filing_date,
+            accession=self.accession,
+            path=document_path
+        )
+
+    def __iter__(self):
+        """Make Submission iterable by yielding all documents."""
+        for idx in range(len(self.metadata.content['documents'])):
+            yield self._load_document_by_index(idx)
+
+    def document_type(self, document_type):
+        """Yield documents matching the specified type(s)."""
+        # Convert single document type to list for consistent handling
+        if isinstance(document_type, str):
+            document_types = [document_type]
+        else:
+            document_types = [item for item in document_type]
+
+        for idx, doc in enumerate(self.metadata.content['documents']):
+            if doc['type'] in document_types:
+                yield self._load_document_by_index(idx)
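The rewritten Submission class accepts either a directory or the new .tar layout, loads documents lazily via _load_document_by_index, and adds compress()/decompress() for optional gzip or zstd storage inside the tar. A minimal usage sketch (the archive path and document type are illustrative assumptions):

    from datamule.submission import Submission

    # Hypothetical example: open a downloaded archive and iterate matching documents.
    sub = Submission(path="filings/000032019323000106.tar")
    for doc in sub.document_type("10-K"):
        print(doc.type)

    # A directory-based submission can be packed into a tar, compressing files
    # over ~1 MB with zstd:
    # sub = Submission(path="filings/000032019323000106")
    # sub.compress(compression="zstd", threshold=1048576)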
{datamule-1.4.6 → datamule-1.5.0}/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.4.6
+Version: 1.5.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-1.4.6 → datamule-1.5.0}/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.4.6",
+    version="1.5.0",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
datamule-1.4.6/datamule/sec/submissions/downloader.py
@@ -1,64 +0,0 @@
-import os
-import json
-from .streamer import stream
-import aiofiles
-from ...submission import Submission
-
-async def download_callback(hit, content, cik, accno, url, output_dir="filings", keep_document_types=None):
-    """Save downloaded SEC submission to disk."""
-    try:
-        # Create a Submission object directly from the content
-        # Note: the content needs to be decoded from bytes to string for the parser
-        submission = Submission(sgml_content=content,
-                                keep_document_types=keep_document_types)
-
-        # Use the async save method to write the submission to disk
-        file_dir = await submission.save_async(output_dir=output_dir)
-
-        return file_dir
-    except Exception as e:
-        print(f"Error processing {accno}: {e}")
-        return None
-
-def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-             requests_per_second=5, output_dir="filings", accession_numbers=None,
-             quiet=False, keep_document_types=None):
-    """
-    Download SEC EDGAR filings and extract their documents.
-
-    Parameters:
-    - cik: CIK number(s) to query for
-    - submission_type: Filing type(s) to query for (default: 10-K)
-    - filing_date: Date or date range to query for
-    - location: Location code to filter by (e.g., 'CA' for California)
-    - name: Company name to search for (alternative to providing CIK)
-    - requests_per_second: Rate limit for SEC requests
-    - output_dir: Directory to save documents
-    - accession_numbers: Optional list of accession numbers to filter by
-    - quiet: Whether to suppress progress output
-    - keep_document_types: Optional list of document types to keep (e.g. ['10-K', 'EX-10.1'])
-
-    Returns:
-    - List of all document paths processed
-    """
-    # Make sure output directory exists
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Create a wrapper for the download_callback that includes the output_dir
-    async def callback_wrapper(hit, content, cik, accno, url):
-        return await download_callback(hit, content, cik, accno, url,
-                                       output_dir=output_dir,
-                                       keep_document_types=keep_document_types)
-
-    # Call the stream function with our callback
-    return stream(
-        cik=cik,
-        name=name,
-        submission_type=submission_type,
-        filing_date=filing_date,
-        location=location,
-        requests_per_second=requests_per_second,
-        document_callback=callback_wrapper,
-        accession_numbers=accession_numbers,
-        quiet=quiet
-    )
datamule-1.4.6/datamule/submission.py
@@ -1,261 +0,0 @@
-from pathlib import Path
-import json
-from .document.document import Document
-from secsgml import parse_sgml_content_into_memory
-import os
-import aiofiles
-import tempfile
-
-
-# # NEW CODE YAY. probably will remove
-
-# def save_metadata_atomically(metadata_file_path, metadata_content):
-#     """Save metadata to a JSONL file atomically, works on any filesystem"""
-
-#     # Create directory if it doesn't exist
-#     os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
-
-#     # Format the JSON with newline
-#     json_str = json.dumps(metadata_content, indent=4) + "\n"
-
-#     # Write complete content to a temporary file first
-#     fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
-#     try:
-#         with os.fdopen(fd, 'w') as temp_file:
-#             temp_file.write(json_str)
-#             temp_file.flush()
-#             os.fsync(temp_file.fileno())  # Force write to disk
-
-#         # Append the temporary file to the main file
-#         with open(metadata_file_path, 'a') as target_file:
-#             with open(temp_path, 'r') as temp_read:
-#                 content = temp_read.read()
-#                 target_file.write(content)
-#                 target_file.flush()
-#                 os.fsync(target_file.fileno())  # Force write to disk
-#     finally:
-#         # Clean up the temporary file
-#         if os.path.exists(temp_path):
-#             os.unlink(temp_path)
-
-# async def save_metadata_atomically_async(metadata_file_path, metadata_content):
-#     """Save metadata to a JSONL file atomically in async mode"""
-
-#     # Create directory if it doesn't exist
-#     os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
-
-#     # Format the JSON with newline
-#     json_str = json.dumps(metadata_content, indent=4) + "\n"
-
-#     # Write to a temporary file first
-#     fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
-#     os.close(fd)  # Close the file descriptor
-
-#     try:
-#         async with aiofiles.open(temp_path, 'w') as temp_file:
-#             await temp_file.write(json_str)
-#             await temp_file.flush()
-
-#         # Append the temporary file to the main file
-#         async with aiofiles.open(metadata_file_path, 'a') as target_file:
-#             async with aiofiles.open(temp_path, 'r') as temp_read:
-#                 content = await temp_read.read()
-#                 await target_file.write(content)
-#                 await target_file.flush()
-#     finally:
-#         # Clean up the temporary file
-#         if os.path.exists(temp_path):
-#             os.unlink(temp_path)
-
-# # END OF NEW CODE
-
-
-class Submission:
-    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
-        if path is None and sgml_content is None:
-            raise ValueError("Either path or sgml_content must be provided")
-        if path is not None and sgml_content is not None:
-            raise ValueError("Only one of path or sgml_content must be provided")
-
-        if sgml_content is not None:
-            self.path = None
-            metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
-            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-            # code dupe
-            self.accession = self.metadata.content['accession-number']
-            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-            self.documents = []
-            filtered_metadata_documents = []
-
-            for idx,doc in enumerate(self.metadata.content['documents']):
-                type = doc.get('type')
-
-                # Keep only specified types
-                if keep_document_types is not None and type not in keep_document_types:
-                    continue
-
-                # write as txt if not declared
-                filename = doc.get('filename','.txt')
-                extension = Path(filename).suffix
-                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
-
-                filtered_metadata_documents.append(doc)
-
-            self.metadata.content['documents'] = filtered_metadata_documents
-
-        if path is not None:
-            self.path = Path(path)
-            metadata_path = self.path / 'metadata.json'
-            with metadata_path.open('r') as f:
-                metadata = json.load(f)
-            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
-            # Code dupe
-            self.accession = self.metadata.content['accession-number']
-            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-
-    def document_type(self, document_type):
-        # Convert single document type to list for consistent handling
-        if isinstance(document_type, str):
-            document_types = [document_type]
-        else:
-            document_types = document_type
-
-        for idx,doc in enumerate(self.metadata.content['documents']):
-            if doc['type'] in document_types:
-
-                # if loaded from path
-                if self.path is not None:
-                    filename = doc.get('filename')
-                    # oh we need handling here for sequences case
-                    if filename is None:
-                        filename = doc['sequence'] + '.txt'
-
-                    document_path = self.path / filename
-                    extension = document_path.suffix
-
-                    with document_path.open('rb') as f:
-                        content = f.read()
-
-                    if extension in ['.htm','.html','.txt','.xml']:
-                        content = content.decode('utf-8', errors='replace')
-
-                    yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
-                # if loaded from sgml_content
-                else:
-                    yield self.documents[idx]
-
-
-    def __iter__(self):
-        for idx,doc in enumerate(self.metadata.content['documents']):
-            # if loaded from path
-            if self.path is not None:
-                filename = doc.get('filename')
-
-                # oh we need handling here for sequences case
-                if filename is None:
-                    filename = doc['sequence'] + '.txt'
-
-                document_path = self.path / filename
-                extension = document_path.suffix
-
-                # check if the file exists
-                if document_path.exists():
-                    with document_path.open('rb') as f:
-                        content = f.read()
-
-                    if extension in ['.htm','.html','.txt','.xml']:
-                        content = content.decode('utf-8', errors='replace')
-
-                    yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
-                else:
-                    print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
-
-            # if loaded from sgml_content
-            else:
-                yield self.documents[idx]
-
-
-
-
-    def save(self, output_dir="filings"):
-        file_dir = Path(output_dir) / str(self.accession)
-        file_dir.mkdir(parents=True, exist_ok=True)
-
-        metadata_path = file_dir / "metadata.json"
-        with open(metadata_path, 'w') as f:
-            json.dump(self.metadata.content, f, indent=4)
-
-        for idx, doc in enumerate(self.metadata.content['documents']):
-            filename = doc.get('filename')
-            if filename is None:
-                filename = f"{doc.get('sequence')}.txt"
-
-            doc_path = file_dir / filename
-
-            if self.path is not None:
-                if hasattr(self, 'documents') and self.documents:
-                    content = self.documents[idx].content
-                else:
-                    orig_doc_path = self.path / filename
-                    if orig_doc_path.exists():
-                        with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
-                            content = f.read()
-                    else:
-                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
-                        continue
-            else:
-                content = self.documents[idx].content
-
-            if isinstance(content, bytes):
-                with open(doc_path, 'wb') as f:
-                    f.write(content)
-            else:
-                with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
-                    f.write(content)
-
-        return file_dir
-
-    async def save_async(self, output_dir="filings"):
-        file_dir = Path(output_dir) / str(self.accession)
-        os.makedirs(file_dir, exist_ok=True)
-
-        metadata_path = file_dir / "metadata.json"
-        async with aiofiles.open(metadata_path, 'w') as f:
-            await f.write(json.dumps(self.metadata.content, indent=4))
-
-        for idx, doc in enumerate(self.metadata.content['documents']):
-            filename = doc.get('filename')
-            # oh we need handling here for sequences case
-            if filename is None:
-                filename = doc['sequence'] + '.txt'
-
-
-            doc_path = file_dir / filename
-
-            if self.path is not None:
-                if hasattr(self, 'documents') and self.documents:
-                    content = self.documents[idx].content
-                else:
-                    orig_doc_path = self.path / filename
-                    if orig_doc_path.exists():
-                        async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
-                            content = await f.read()
-                    else:
-                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
-                        continue
-            else:
-                content = self.documents[idx].content
-
-            if isinstance(content, bytes):
-                async with aiofiles.open(doc_path, 'wb') as f:
-                    await f.write(content)
-            else:
-                async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
-                    await f.write(content)
-
-        return file_dir