datamule 1.6.0__py3-none-any.whl → 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/portfolio.py CHANGED
@@ -1,11 +1,13 @@
1
1
  from pathlib import Path
2
2
  from tqdm import tqdm
3
- from concurrent.futures import ThreadPoolExecutor
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
4
  from .submission import Submission
5
5
  from .sec.submissions.downloader import download as sec_download
6
6
  from .sec.submissions.textsearch import filter_text
7
7
  from .config import Config
8
8
  import os
9
+ import tarfile
10
+ from threading import Lock
9
11
  from .helper import _process_cik_and_metadata_filters
10
12
  from .seclibrary.downloader import download as seclibrary_download
11
13
  from .sec.xbrl.filter_xbrl import filter_xbrl
@@ -21,6 +23,10 @@ class Portfolio:
21
23
  self.submissions = []
22
24
  self.submissions_loaded = False
23
25
  self.MAX_WORKERS = os.cpu_count() - 1
26
+
27
+ # Batch tar support
28
+ self.batch_tar_handles = {} # {batch_tar_path: tarfile_handle}
29
+ self.batch_tar_locks = {} # {batch_tar_path: threading.Lock}
24
30
 
25
31
  self.monitor = Monitor()
26
32
 
@@ -34,9 +40,13 @@ class Portfolio:
34
40
  self.api_key = api_key
35
41
 
36
42
  def _load_submissions(self):
37
- folders = [f for f in self.path.iterdir() if f.is_dir() or f.suffix=='.tar']
38
- print(f"Loading {len(folders)} submissions")
43
+ print(f"Loading submissions")
44
+
45
+ # Separate regular and batch items
46
+ regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
47
+ batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
39
48
 
49
+ # Load regular submissions (existing logic)
40
50
  def load_submission(folder):
41
51
  try:
42
52
  return Submission(folder)
@@ -44,17 +54,86 @@ class Portfolio:
44
54
  print(f"Error loading submission from {folder}: {str(e)}")
45
55
  return None
46
56
 
47
- with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
48
- self.submissions = list(tqdm(
49
- executor.map(load_submission, folders),
50
- total=len(folders),
51
- desc="Loading submissions"
52
- ))
53
-
54
- # Filter out None values from failed submissions
55
- self.submissions = [s for s in self.submissions if s is not None]
57
+ regular_submissions = []
58
+ if regular_items:
59
+ with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
60
+ regular_submissions = list(tqdm(
61
+ executor.map(load_submission, regular_items),
62
+ total=len(regular_items),
63
+ desc="Loading regular submissions"
64
+ ))
65
+
66
+ # Load batch submissions with parallel processing + progress
67
+ batch_submissions = []
68
+ if batch_tars:
69
+ with tqdm(desc="Loading batch submissions", unit="submissions") as pbar:
70
+ with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
71
+ # Submit all batch tar jobs
72
+ futures = [
73
+ executor.submit(self._load_batch_submissions_worker, batch_tar, pbar)
74
+ for batch_tar in batch_tars
75
+ ]
76
+
77
+ # Collect results as they complete
78
+ for future in as_completed(futures):
79
+ try:
80
+ batch_submissions.extend(future.result())
81
+ except Exception as e:
82
+ print(f"Error in batch processing: {str(e)}")
83
+
84
+ # Combine and filter None values
85
+ self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
56
86
  print(f"Successfully loaded {len(self.submissions)} submissions")
57
87
 
88
+ def _load_batch_submissions_worker(self, batch_tar_path, pbar):
89
+ """Worker function to load submissions from one batch tar with progress updates"""
90
+ try:
91
+ # Open tar handle and store it
92
+ tar_handle = tarfile.open(batch_tar_path, 'r')
93
+ self.batch_tar_handles[batch_tar_path] = tar_handle
94
+ self.batch_tar_locks[batch_tar_path] = Lock()
95
+
96
+ # Find all accession directories
97
+ accession_prefixes = set()
98
+ for member in tar_handle.getmembers():
99
+ if '/' in member.name and member.name.endswith('metadata.json'):
100
+ accession_prefix = member.name.split('/')[0]
101
+ accession_prefixes.add(accession_prefix)
102
+
103
+ # Create submissions for each accession
104
+ submissions = []
105
+ for accession_prefix in accession_prefixes:
106
+ try:
107
+ submission = Submission(
108
+ batch_tar_path=batch_tar_path,
109
+ accession_prefix=accession_prefix,
110
+ portfolio_ref=self
111
+ )
112
+ submissions.append(submission)
113
+ pbar.update(1) # Update progress for each successful submission
114
+ except Exception as e:
115
+ print(f"Error loading batch submission {accession_prefix} from {batch_tar_path.name}: {str(e)}")
116
+
117
+ return submissions
118
+
119
+ except Exception as e:
120
+ print(f"Error loading batch tar {batch_tar_path}: {str(e)}")
121
+ return []
122
+
123
+ def _close_batch_handles(self):
124
+ """Close all open batch tar handles to free resources"""
125
+ for handle in self.batch_tar_handles.values():
126
+ try:
127
+ handle.close()
128
+ except Exception as e:
129
+ print(f"Error closing batch tar handle: {str(e)}")
130
+ self.batch_tar_handles.clear()
131
+ self.batch_tar_locks.clear()
132
+
133
+ def __del__(self):
134
+ """Cleanup batch tar handles on destruction"""
135
+ self._close_batch_handles()
136
+
58
137
  def process_submissions(self, callback):
59
138
  """Process all submissions using a thread pool."""
60
139
  if not self.submissions_loaded:
@@ -169,6 +248,7 @@ class Portfolio:
169
248
  )
170
249
 
171
250
  self.submissions_loaded = False
251
+
172
252
  def monitor_submissions(self, data_callback=None, interval_callback=None,
173
253
  polling_interval=1000, quiet=True, start_date=None,
174
254
  validation_interval=600000):
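
Usage sketch: the batch-aware Portfolio above keeps one shared tarfile handle per batch_*.tar archive, guarded by a per-archive Lock, and builds a Submission for every accession prefix found inside each archive. The sketch below assumes the package-level `from datamule import Portfolio` import and a Portfolio constructed from a directory path, neither of which is shown in this diff.

from datamule import Portfolio

# Directory produced by the seclibrary downloader, containing batch_000_001.tar,
# batch_001_001.tar, ... alongside any regular per-submission folders or tars.
portfolio = Portfolio("filings")

def count_documents(submission):
    # Document contents are read lazily through the shared, locked tar handle.
    return len(submission.metadata.content['documents'])

# process_submissions() loads submissions first if they are not loaded yet; loading
# splits directory entries into regular items and 'batch' tars, as in _load_submissions().
portfolio.process_submissions(count_documents)

# Handles stay open for the lifetime of the Portfolio and are closed in __del__,
# or explicitly via the private _close_batch_handles() helper.
portfolio._close_batch_handles()
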
datamule/sec/submissions/monitor.py CHANGED
@@ -9,16 +9,14 @@ from .eftsquery import EFTSQuery
9
9
  import aiohttp
10
10
  from zoneinfo import ZoneInfo
11
11
 
12
- async def poll_rss(limiter):
12
+ async def poll_rss(limiter, session):
13
13
  base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
14
14
 
15
- # Create a session specifically for this RSS polling operation
16
- async with aiohttp.ClientSession(headers=headers) as session:
17
- # Use the rate limiter before making the request
18
- async with limiter:
19
- # Make the HTTP request with the session
20
- async with session.get(base_url) as response:
21
- content = await response.read()
15
+ # Use the rate limiter before making the request
16
+ async with limiter:
17
+ # Use the provided session instead of creating a new one
18
+ async with session.get(base_url) as response:
19
+ content = await response.read()
22
20
 
23
21
  # Process the content
24
22
  content_str = content.decode('utf-8')
@@ -70,12 +68,31 @@ class Monitor():
70
68
  self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
71
69
  self.efts_query = EFTSQuery(quiet=True)
72
70
  self.efts_query.limiter = self.ratelimiters['sec.gov']
71
+ self.session = None
72
+ self.session_created_at = 0
73
+ self.session_lifetime = 300 # 5 minutes in seconds
73
74
 
74
75
  def set_domain_rate_limit(self, domain, rate):
75
76
  self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
76
77
  if domain == 'sec.gov':
77
78
  self.efts_query.limiter = self.ratelimiters[domain]
78
79
 
80
+ async def _ensure_fresh_session(self):
81
+ """Ensure we have a fresh session, recreating if expired or missing"""
82
+ current_time = time.time()
83
+
84
+ # Check if we need a new session
85
+ if (self.session is None or
86
+ current_time - self.session_created_at > self.session_lifetime):
87
+
88
+ # Close old session if it exists
89
+ if self.session:
90
+ await self.session.close()
91
+
92
+ # Create new session
93
+ self.session = aiohttp.ClientSession(headers=headers)
94
+ self.session_created_at = current_time
95
+
79
96
  async def _async_run_efts_query(self, **kwargs):
80
97
  """Async helper method to run EFTS query without creating a new event loop"""
81
98
  # Make sure to set quiet parameter if provided in kwargs
@@ -103,83 +120,106 @@ class Monitor():
103
120
  if polling_interval is None and validation_interval is None:
104
121
  raise ValueError("At least one of polling_interval or validation_interval must be specified")
105
122
 
106
- # Backfill if start_date is provided
107
- if start_date is not None:
108
- today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
109
- if not quiet:
110
- print(f"Backfilling from {start_date} to {today_date}")
111
-
112
- hits = clean_efts_hits(await self._async_run_efts_query(
113
- filing_date=(start_date, today_date),
114
- quiet=quiet
115
- ))
116
-
117
- new_hits = self._filter_new_accessions(hits)
118
- if not quiet:
119
- print(f"New submissions found: {len(new_hits)}")
120
- if new_hits and data_callback:
121
- data_callback(new_hits)
122
-
123
- # Initialize timing variables
124
- current_time = time.time()
125
- last_polling_time = current_time
126
- last_validation_time = current_time
127
-
128
- # Determine which operations to perform
129
- do_polling = polling_interval is not None
130
- do_validation = validation_interval is not None
123
+ # Ensure we have a fresh session
124
+ await self._ensure_fresh_session()
131
125
 
132
- while True:
133
- current_time = time.time()
134
-
135
- # RSS polling (if enabled)
136
- if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
137
- if not quiet:
138
- print(f"Polling RSS feed")
139
- results = await poll_rss(self.ratelimiters['sec.gov'])
140
- new_results = self._filter_new_accessions(results)
141
- if new_results:
142
- if not quiet:
143
- print(f"Found {len(new_results)} new submissions via RSS")
144
- if data_callback:
145
- data_callback(new_results)
146
- last_polling_time = current_time
147
-
148
- # EFTS validation (if enabled)
149
- if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
150
- # Get submissions from the last 24 hours for validation
126
+ try:
127
+ # Backfill if start_date is provided
128
+ if start_date is not None:
151
129
  today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
152
130
  if not quiet:
153
- print(f"Validating submissions from {today_date}")
131
+ print(f"Backfilling from {start_date} to {today_date}")
154
132
 
155
133
  hits = clean_efts_hits(await self._async_run_efts_query(
156
- filing_date=(today_date, today_date),
134
+ filing_date=(start_date, today_date),
157
135
  quiet=quiet
158
136
  ))
159
-
137
+
160
138
  new_hits = self._filter_new_accessions(hits)
161
- if new_hits:
162
- if not quiet:
163
- print(f"Found {len(new_hits)} new submissions via EFTS validation")
164
- if data_callback:
165
- data_callback(new_hits)
166
- last_validation_time = current_time
139
+ if not quiet:
140
+ print(f"New submissions found: {len(new_hits)}")
141
+ if new_hits and data_callback:
142
+ data_callback(new_hits)
143
+
144
+ # Initialize timing variables
145
+ current_time = time.time()
146
+ last_polling_time = current_time
147
+ last_validation_time = current_time
167
148
 
168
- # Interval callback
169
- if interval_callback:
170
- interval_callback()
171
-
172
- # Calculate next wake-up time
173
- next_times = []
174
- if do_polling:
175
- next_times.append(last_polling_time + (polling_interval / 1000))
176
- if do_validation:
177
- next_times.append(last_validation_time + (validation_interval / 1000))
149
+ # Determine which operations to perform
150
+ do_polling = polling_interval is not None
151
+ do_validation = validation_interval is not None
178
152
 
179
- next_wake_time = min(next_times)
180
- current_time = time.time()
181
- time_to_sleep = max(0, next_wake_time - current_time)
182
- await asyncio.sleep(time_to_sleep)
153
+ while True:
154
+ current_time = time.time()
155
+
156
+ # RSS polling (if enabled)
157
+ if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
158
+ if not quiet:
159
+ print(f"Polling RSS feed")
160
+
161
+ # Ensure session is fresh before polling
162
+ await self._ensure_fresh_session()
163
+
164
+ try:
165
+ results = await poll_rss(self.ratelimiters['sec.gov'], self.session)
166
+ new_results = self._filter_new_accessions(results)
167
+ if new_results:
168
+ if not quiet:
169
+ print(f"Found {len(new_results)} new submissions via RSS")
170
+ if data_callback:
171
+ data_callback(new_results)
172
+ except Exception as e:
173
+ if not quiet:
174
+ print(f"RSS polling error: {e}, will recreate session on next poll")
175
+ # Force session recreation on next poll
176
+ if self.session:
177
+ await self.session.close()
178
+ self.session = None
179
+
180
+ last_polling_time = current_time
181
+
182
+ # EFTS validation (if enabled)
183
+ if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
184
+ # Get submissions from the last 24 hours for validation
185
+ today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
186
+ if not quiet:
187
+ print(f"Validating submissions from {today_date}")
188
+
189
+ hits = clean_efts_hits(await self._async_run_efts_query(
190
+ filing_date=(today_date, today_date),
191
+ quiet=quiet
192
+ ))
193
+
194
+ new_hits = self._filter_new_accessions(hits)
195
+ if new_hits:
196
+ if not quiet:
197
+ print(f"Found {len(new_hits)} new submissions via EFTS validation")
198
+ if data_callback:
199
+ data_callback(new_hits)
200
+ last_validation_time = current_time
201
+
202
+ # Interval callback
203
+ if interval_callback:
204
+ interval_callback()
205
+
206
+ # Calculate next wake-up time
207
+ next_times = []
208
+ if do_polling:
209
+ next_times.append(last_polling_time + (polling_interval / 1000))
210
+ if do_validation:
211
+ next_times.append(last_validation_time + (validation_interval / 1000))
212
+
213
+ next_wake_time = min(next_times)
214
+ current_time = time.time()
215
+ time_to_sleep = max(0, next_wake_time - current_time)
216
+ await asyncio.sleep(time_to_sleep)
217
+
218
+ finally:
219
+ # Clean up the session when done
220
+ if self.session:
221
+ await self.session.close()
222
+ self.session = None
183
223
 
184
224
  def monitor_submissions(self, data_callback=None, interval_callback=None,
185
225
  polling_interval=1000, quiet=True, start_date=None,
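
For context, the Monitor now reuses a single aiohttp ClientSession, recreating it after self.session_lifetime (300 s) or after an RSS polling error, instead of opening a new session on every poll. Below is a hedged sketch of the synchronous wrapper exposed on Portfolio; the exact shape of each hit record is not part of this diff, so it is printed as-is.

from datamule import Portfolio

def on_new_filings(hits):
    # Hits are cleaned EFTS/RSS records for accession numbers not seen before.
    for hit in hits:
        print(hit)

portfolio = Portfolio("filings")
# Intervals are in milliseconds: poll the EDGAR RSS feed every second and
# re-validate today's filings against EFTS every 10 minutes.
portfolio.monitor_submissions(
    data_callback=on_new_filings,
    polling_interval=1000,
    validation_interval=600000,
    quiet=False,
)
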
datamule/seclibrary/downloader.py CHANGED
@@ -8,13 +8,15 @@ import ssl
8
8
  import zstandard as zstd
9
9
  import io
10
10
  import json
11
+ import tarfile
11
12
  from concurrent.futures import ThreadPoolExecutor
12
13
  from functools import partial
13
14
  from queue import Queue, Empty
14
- from threading import Thread
15
+ from threading import Thread, Lock
15
16
  from .query import query
16
17
  from os import cpu_count
17
- from secsgml import write_sgml_file_to_tar
18
+ from secsgml import parse_sgml_content_into_memory
19
+ from secsgml.utils import bytes_to_str
18
20
 
19
21
 
20
22
 
@@ -24,25 +26,19 @@ class Downloader:
24
26
  self.CHUNK_SIZE = 2 * 1024 * 1024
25
27
  self.MAX_CONCURRENT_DOWNLOADS = 100
26
28
  self.MAX_DECOMPRESSION_WORKERS = cpu_count()
27
- self.MAX_PROCESSING_WORKERS = cpu_count()
28
- self.QUEUE_SIZE = 10
29
+ self.MAX_TAR_WORKERS = cpu_count()
29
30
  if api_key is not None:
30
31
  self._api_key = api_key
31
- # Create a shared event loop for async operations
32
32
  self.loop = asyncio.new_event_loop()
33
- # Create a thread to run the event loop
34
33
  self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
35
34
  self.loop_thread.start()
36
- # Create a queue for async tasks
37
35
  self.async_queue = Queue()
38
36
 
39
37
  def _run_event_loop(self):
40
- """Run the event loop in a separate thread"""
41
38
  asyncio.set_event_loop(self.loop)
42
39
  self.loop.run_forever()
43
40
 
44
41
  def _run_coroutine(self, coro):
45
- """Run a coroutine in the event loop and return its result"""
46
42
  future = asyncio.run_coroutine_threadsafe(coro, self.loop)
47
43
  return future.result()
48
44
 
@@ -72,65 +68,94 @@ class Downloader:
72
68
  except Exception as e:
73
69
  print(f"Failed to log error to {error_file}: {str(e)}")
74
70
 
75
- class FileProcessor:
76
- def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[], keep_filtered_metadata=False,standardize_metadata=True):
77
- self.processing_queue = Queue(maxsize=queue_size)
78
- self.should_stop = False
79
- self.processing_workers = []
71
+ class TarManager:
72
+ def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
80
73
  self.output_dir = output_dir
81
- self.max_workers = max_workers
82
- self.batch_size = 50
83
- self.pbar = pbar
84
- self.downloader = downloader
85
- self.keep_document_types = keep_document_types
86
- self.keep_filtered_metadata = keep_filtered_metadata
87
- self.standardize_metadata = standardize_metadata
88
-
89
- def start_processing_workers(self):
90
- for _ in range(self.max_workers):
91
- worker = Thread(target=self._processing_worker)
92
- worker.daemon = True
93
- worker.start()
94
- self.processing_workers.append(worker)
95
-
96
- def _process_file(self, item):
97
- filename, content = item
98
- output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
99
- write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
100
-
101
- self.pbar.update(1)
102
-
103
- def _processing_worker(self):
104
- batch = []
105
- while not self.should_stop:
74
+ self.num_tar_files = num_tar_files
75
+ self.max_batch_size = max_batch_size
76
+ self.tar_files = {}
77
+ self.tar_locks = {}
78
+ self.file_counters = {}
79
+ self.tar_sizes = {}
80
+ self.tar_sequences = {}
81
+
82
+ for i in range(num_tar_files):
83
+ tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
84
+ self.tar_files[i] = tarfile.open(tar_path, 'w')
85
+ self.tar_locks[i] = Lock()
86
+ self.file_counters[i] = 0
87
+ self.tar_sizes[i] = 0
88
+ self.tar_sequences[i] = 1
89
+
90
+ def get_tar_index(self, filename):
91
+ return hash(filename) % self.num_tar_files
92
+
93
+ def write_submission(self, filename, metadata, documents, standardize_metadata):
94
+ tar_index = self.get_tar_index(filename)
95
+ accession_num = filename.split('.')[0]
96
+
97
+ metadata_str = bytes_to_str(metadata, lower=False)
98
+ metadata_json = json.dumps(metadata_str).encode('utf-8')
99
+ submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
100
+
101
+ with self.tar_locks[tar_index]:
102
+ if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
103
+ tar = self.tar_files[tar_index]
104
+ tar.close()
105
+
106
+ self.tar_sequences[tar_index] += 1
107
+ new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
108
+ self.tar_files[tar_index] = tarfile.open(new_tar_path, 'w')
109
+ self.file_counters[tar_index] = 0
110
+ self.tar_sizes[tar_index] = 0
111
+
112
+ tar = self.tar_files[tar_index]
113
+
106
114
  try:
107
- item = self.processing_queue.get(timeout=1)
108
- if item is None:
109
- break
110
-
111
- batch.append(item)
112
-
113
- if len(batch) >= self.batch_size or self.processing_queue.empty():
114
- for item in batch:
115
- self._process_file(item)
116
- self.processing_queue.task_done()
117
- batch = []
118
-
119
- except Empty:
120
- if batch:
121
- for item in batch:
122
- self._process_file(item)
123
- self.processing_queue.task_done()
124
- batch = []
125
-
126
- def stop_workers(self):
127
- self.should_stop = True
128
- for _ in self.processing_workers:
129
- self.processing_queue.put(None)
130
- for worker in self.processing_workers:
131
- worker.join()
115
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
116
+ tarinfo.size = len(metadata_json)
117
+ tar.addfile(tarinfo, io.BytesIO(metadata_json))
118
+
119
+ for file_num, content in enumerate(documents):
120
+ doc_name = self._get_document_name(metadata, file_num, standardize_metadata)
121
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc_name}')
122
+ tarinfo.size = len(content)
123
+ tar.addfile(tarinfo, io.BytesIO(content))
124
+
125
+ self.file_counters[tar_index] += 1
126
+ self.tar_sizes[tar_index] += submission_size
127
+ return True
128
+
129
+ except Exception as e:
130
+ print(f"Error writing {filename} to tar {tar_index}: {str(e)}")
131
+ return False
132
+
133
+ def _get_document_name(self, metadata, file_num, standardize_metadata):
134
+ if standardize_metadata:
135
+ documents_key = b'documents'
136
+ filename_key = b'filename'
137
+ sequence_key = b'sequence'
138
+ else:
139
+ documents_key = b'DOCUMENTS'
140
+ filename_key = b'FILENAME'
141
+ sequence_key = b'SEQUENCE'
142
+
143
+ doc_metadata = metadata[documents_key][file_num]
144
+ filename = doc_metadata.get(filename_key)
145
+ if filename:
146
+ return filename.decode('utf-8')
147
+ else:
148
+ sequence = doc_metadata.get(sequence_key, b'document')
149
+ return sequence.decode('utf-8') + '.txt'
150
+
151
+ def close_all(self):
152
+ for i, tar in self.tar_files.items():
153
+ try:
154
+ tar.close()
155
+ except Exception as e:
156
+ print(f"Error closing tar {i}: {str(e)}")
132
157
 
133
- def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
158
+ def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
134
159
  dctx = zstd.ZstdDecompressor()
135
160
  try:
136
161
  input_buffer = io.BytesIO(b''.join(compressed_chunks))
@@ -140,11 +165,19 @@ class Downloader:
140
165
  shutil.copyfileobj(reader, decompressed_content)
141
166
 
142
167
  content = decompressed_content.getvalue()
143
- processor.processing_queue.put((filename, content))
144
- return True
168
+
169
+ metadata, documents = parse_sgml_content_into_memory(
170
+ bytes_content=content,
171
+ filter_document_types=keep_document_types,
172
+ keep_filtered_metadata=keep_filtered_metadata,
173
+ standardize_metadata=standardize_metadata
174
+ )
175
+
176
+ success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
177
+ return success
145
178
 
146
179
  except Exception as e:
147
- self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
180
+ self._log_error(output_dir, filename, f"Decompression/parsing error: {str(e)}")
148
181
  return False
149
182
  finally:
150
183
  try:
@@ -153,17 +186,25 @@ class Downloader:
153
186
  except:
154
187
  pass
155
188
 
156
- def save_regular_file(self, chunks, filename, output_dir, processor):
189
+ def parse_and_write_regular_file(self, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
157
190
  try:
158
191
  content = b''.join(chunks)
159
- processor.processing_queue.put((filename, content))
160
- return True
192
+
193
+ metadata, documents = parse_sgml_content_into_memory(
194
+ bytes_content=content,
195
+ filter_document_types=keep_document_types,
196
+ keep_filtered_metadata=keep_filtered_metadata,
197
+ standardize_metadata=standardize_metadata
198
+ )
199
+
200
+ success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
201
+ return success
161
202
 
162
203
  except Exception as e:
163
- self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
204
+ self._log_error(output_dir, filename, f"Parsing error: {str(e)}")
164
205
  return False
165
206
 
166
- async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
207
+ async def download_and_process(self, session, url, semaphore, decompression_pool, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir, pbar):
167
208
  async with semaphore:
168
209
  chunks = []
169
210
  filename = url.split('/')[-1]
@@ -188,70 +229,70 @@ class Downloader:
188
229
  if filename.endswith('.zst'):
189
230
  success = await loop.run_in_executor(
190
231
  decompression_pool,
191
- partial(self.decompress_stream, chunks, filename, output_dir, processor)
232
+ partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
192
233
  )
193
234
  else:
194
235
  success = await loop.run_in_executor(
195
236
  decompression_pool,
196
- partial(self.save_regular_file, chunks, filename, output_dir, processor)
237
+ partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
197
238
  )
198
239
 
199
240
  if not success:
200
241
  self._log_error(output_dir, filename, "Failed to process file")
242
+
201
243
  elif response.status == 401:
202
244
  self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
203
245
  raise ValueError("Invalid API key")
204
246
  else:
205
247
  self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
248
+
249
+ pbar.update(1)
250
+
206
251
  except Exception as e:
207
252
  self._log_error(output_dir, filename, str(e))
253
+ pbar.update(1)
208
254
 
209
- async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
255
+ async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
210
256
  os.makedirs(output_dir, exist_ok=True)
211
257
 
212
- with tqdm(total=len(urls), desc="Processing files") as pbar:
213
- processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types,
214
- keep_filtered_metadata=keep_filtered_metadata,standardize_metadata=standardize_metadata)
215
- processor.start_processing_workers()
216
-
217
- semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
218
- decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
219
-
220
- connector = aiohttp.TCPConnector(
221
- limit=self.MAX_CONCURRENT_DOWNLOADS,
222
- force_close=False,
223
- ssl=ssl.create_default_context(),
224
- ttl_dns_cache=300,
225
- keepalive_timeout=60
226
- )
227
-
228
- # timeout should be max 30s.
229
- async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
230
- tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
231
- await asyncio.gather(*tasks, return_exceptions=True)
232
-
233
- processor.processing_queue.join()
234
- processor.stop_workers()
235
- decompression_pool.shutdown()
258
+ num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
259
+
260
+ tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
261
+
262
+ try:
263
+ with tqdm(total=len(urls), desc="Processing files") as pbar:
264
+ semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
265
+ decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
266
+
267
+ connector = aiohttp.TCPConnector(
268
+ limit=self.MAX_CONCURRENT_DOWNLOADS,
269
+ force_close=False,
270
+ ssl=ssl.create_default_context(),
271
+ ttl_dns_cache=300,
272
+ keepalive_timeout=60
273
+ )
274
+
275
+ async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
276
+ tasks = [
277
+ self.download_and_process(
278
+ session, url, semaphore, decompression_pool,
279
+ keep_document_types, keep_filtered_metadata, standardize_metadata,
280
+ tar_manager, output_dir, pbar
281
+ )
282
+ for url in urls
283
+ ]
284
+ await asyncio.gather(*tasks, return_exceptions=True)
285
+
286
+ decompression_pool.shutdown()
287
+
288
+ finally:
289
+ tar_manager.close_all()
236
290
 
237
291
  def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
238
- skip_accession_numbers=[]):
239
- """
240
- Query SEC filings and download/process them.
241
-
242
- Parameters:
243
- - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
244
- - cik: Company CIK number(s), string, int, or list
245
- - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
246
- - output_dir: Directory to save downloaded files
247
- - accession_numbers: List of specific accession numbers to download
248
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
249
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
250
- """
292
+ skip_accession_numbers=[], max_batch_size=1024*1024*1024):
251
293
  if self.api_key is None:
252
294
  raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
253
295
 
254
- # Query the SEC filings first - before starting any async operations
255
296
  print("Querying SEC filings...")
256
297
  filings = query(
257
298
  submission_type=submission_type,
@@ -260,19 +301,14 @@ class Downloader:
260
301
  api_key=self.api_key
261
302
  )
262
303
 
263
-
264
- # After querying but before generating URLs
265
304
  if accession_numbers:
266
305
  accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
267
306
  filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]
268
307
 
269
-
270
308
  if skip_accession_numbers:
271
309
  skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
272
310
  filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]
273
311
 
274
- # Generate URLs from the query results
275
-
276
312
  print(f"Generating URLs for {len(filings)} filings...")
277
313
  urls = []
278
314
  for item in filings:
@@ -285,38 +321,21 @@ class Downloader:
285
321
  print("No submissions found matching the criteria")
286
322
  return
287
323
 
288
- # Remove duplicates
289
324
  urls = list(set(urls))
290
325
 
291
- # Now start the async processing
292
326
  start_time = time.time()
293
327
 
294
- # Process the batch asynchronously
295
- asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
328
+ asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))
296
329
 
297
- # Calculate and display performance metrics
298
330
  elapsed_time = time.time() - start_time
299
331
  print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
300
332
  print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
301
333
 
302
334
  def __del__(self):
303
- """Cleanup when the downloader is garbage collected"""
304
335
  if hasattr(self, 'loop') and self.loop.is_running():
305
336
  self.loop.call_soon_threadsafe(self.loop.stop)
306
337
 
307
-
308
-
309
- def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
310
- """
311
- Download and process SEC filings using specific filenames.
312
-
313
- Parameters:
314
- - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
315
- - output_dir: Directory to save downloaded files
316
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
317
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
318
- - standardize_metadata: Whether to standardize metadata format
319
- """
338
+ def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
320
339
  if self.api_key is None:
321
340
  raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
322
341
 
@@ -326,27 +345,23 @@ class Downloader:
326
345
  if not isinstance(filenames, (list, tuple)):
327
346
  filenames = [filenames]
328
347
 
329
- # Validate filenames format
330
348
  for filename in filenames:
331
349
  if not isinstance(filename, str):
332
350
  raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
333
351
  if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
334
352
  raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
335
353
 
336
- # Generate URLs directly from filenames
337
354
  print(f"Generating URLs for {len(filenames)} files...")
338
355
  urls = []
339
356
  for filename in filenames:
340
357
  url = f"{self.BASE_URL}{filename}"
341
358
  urls.append(url)
342
359
 
343
- # Remove duplicates while preserving order
344
360
  seen = set()
345
361
  urls = [url for url in urls if not (url in seen or seen.add(url))]
346
362
 
347
363
  print(f"Downloading {len(urls)} files...")
348
364
 
349
- # Process the batch asynchronously using existing infrastructure
350
365
  start_time = time.time()
351
366
 
352
367
  asyncio.run(self.process_batch(
@@ -354,33 +369,19 @@ class Downloader:
354
369
  output_dir,
355
370
  keep_document_types=keep_document_types,
356
371
  keep_filtered_metadata=keep_filtered_metadata,
357
- standardize_metadata=standardize_metadata
372
+ standardize_metadata=standardize_metadata,
373
+ max_batch_size=max_batch_size
358
374
  ))
359
375
 
360
- # Calculate and display performance metrics
361
376
  elapsed_time = time.time() - start_time
362
377
  print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
363
378
  print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
364
379
 
365
380
 
366
381
  def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
367
- skip_accession_numbers=[]):
368
- """
369
- Query SEC filings and download/process them.
370
-
371
- Parameters:
372
- - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
373
- - cik: Company CIK number(s), string, int, or list
374
- - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
375
- - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
376
- - output_dir: Directory to save downloaded files
377
- - accession_numbers: List of specific accession numbers to download
378
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
379
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
380
- """
382
+ skip_accession_numbers=[], max_batch_size=1024*1024*1024):
381
383
  if accession_numbers:
382
384
  accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
383
- # check if acc no is empty list
384
385
  elif accession_numbers == []:
385
386
  raise ValueError("Applied filter resulted in empty accession numbers list")
386
387
  downloader = Downloader(api_key=api_key)
@@ -393,5 +394,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
393
394
  keep_document_types=keep_document_types,
394
395
  keep_filtered_metadata=keep_filtered_metadata,
395
396
  standardize_metadata=standardize_metadata,
396
- skip_accession_numbers=skip_accession_numbers
397
- )
397
+ skip_accession_numbers=skip_accession_numbers,
398
+ max_batch_size=max_batch_size
399
+ )
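
In short, the downloader no longer writes one tar per submission through a FileProcessor queue; it parses SGML in memory and appends each submission to one of several rolling archives named batch_<shard>_<sequence>.tar, where the shard is chosen by hashing the source filename and a shard rolls to the next sequence once it would exceed max_batch_size (1 GiB by default). A sketch of the module-level entry point follows; the filter values are illustrative only.

from datamule.seclibrary.downloader import download

# Requires DATAMULE_API_KEY in the environment, or pass api_key=... explicitly.
download(
    submission_type="10-K",
    filing_date=("2024-01-01", "2024-03-31"),
    output_dir="filings",
    keep_document_types=["10-K", "EX-10.1"],
    max_batch_size=512 * 1024 * 1024,  # roll each batch tar over near 512 MB instead of 1 GiB
)
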
datamule/submission.py CHANGED
@@ -2,50 +2,15 @@ from pathlib import Path
2
2
  import json
3
3
  from .document.document import Document
4
4
  from secsgml import parse_sgml_content_into_memory
5
- from secsgml.utils import bytes_to_str
5
+ from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
6
6
  from secsgml.parse_sgml import transform_metadata_string
7
7
  import tarfile
8
8
  import shutil
9
9
  import zstandard as zstd
10
10
  import gzip
11
11
  import io
12
- import copy
13
12
 
14
13
 
15
- def calculate_documents_locations_in_tar(metadata, documents):
16
- # Step 1: Add placeholder byte positions to get accurate size (10-digit padded)
17
- placeholder_metadata = copy.deepcopy(metadata)
18
-
19
- for file_num in range(len(documents)):
20
- if 'documents' in placeholder_metadata:
21
- placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999" # 10 digits
22
- placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999" # 10 digits
23
-
24
- # Step 2: Calculate size with placeholders
25
- placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
26
- placeholder_json = json.dumps(placeholder_str).encode('utf-8')
27
- metadata_size = len(placeholder_json)
28
-
29
- # Step 3: Now calculate actual positions using this size
30
- current_pos = 512 + metadata_size
31
- current_pos += (512 - (current_pos % 512)) % 512
32
-
33
- # Step 4: Calculate real positions and update original metadata (10-digit padded)
34
- for file_num, content in enumerate(documents):
35
- start_byte = current_pos + 512
36
- end_byte = start_byte + len(content)
37
-
38
- if 'documents' in metadata:
39
- metadata['documents'][file_num]['secsgml_start_byte'] = f"{start_byte:010d}" # 10-digit padding
40
- metadata['documents'][file_num]['secsgml_end_byte'] = f"{end_byte:010d}" # 10-digit padding
41
-
42
-
43
- file_total_size = 512 + len(content)
44
- padded_size = file_total_size + (512 - (file_total_size % 512)) % 512
45
- current_pos += padded_size
46
-
47
- return metadata
48
-
49
14
 
50
15
  def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
51
16
  # Write tar directly to disk
@@ -78,11 +43,21 @@ def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,
78
43
  tar.addfile(tarinfo, io.BytesIO(content))
79
44
 
80
45
  class Submission:
81
- def __init__(self, path=None,sgml_content=None,keep_document_types=None):
82
- if path is None and sgml_content is None:
83
- raise ValueError("Either path or sgml_content must be provided")
84
- if path is not None and sgml_content is not None:
85
- raise ValueError("Only one of path or sgml_content must be provided")
46
+ def __init__(self, path=None, sgml_content=None, keep_document_types=None,
47
+ batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
48
+
49
+ # Validate parameters
50
+ param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
51
+ if param_count != 1:
52
+ raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
53
+
54
+ if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
55
+ raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")
56
+
57
+ # Initialize batch tar attributes
58
+ self.batch_tar_path = batch_tar_path
59
+ self.accession_prefix = accession_prefix
60
+ self.portfolio_ref = portfolio_ref
86
61
 
87
62
  if sgml_content is not None:
88
63
  self.path = None
@@ -100,7 +75,7 @@ class Submission:
100
75
  filtered_metadata_documents = []
101
76
 
102
77
  for idx,doc in enumerate(self.metadata.content['documents']):
103
- type = doc.get('type')()
78
+ type = doc.get('type')
104
79
 
105
80
  # Keep only specified types
106
81
  if keep_document_types is not None and type not in keep_document_types:
@@ -115,7 +90,26 @@ class Submission:
115
90
 
116
91
  self.metadata.content['documents'] = filtered_metadata_documents
117
92
 
118
- if path is not None:
93
+ elif batch_tar_path is not None:
94
+ # Batch tar case
95
+ self.path = None
96
+
97
+ # Load metadata from batch tar
98
+ with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
99
+ tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
100
+ metadata_obj = tar_handle.extractfile(f'{accession_prefix}/metadata.json')
101
+ metadata = json.loads(metadata_obj.read().decode('utf-8'))
102
+
103
+ # Set metadata path using :: notation
104
+ metadata_path = f"{batch_tar_path}::{accession_prefix}/metadata.json"
105
+
106
+ # standardize metadata
107
+ metadata = transform_metadata_string(metadata)
108
+ self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
109
+ self.accession = self.metadata.content['accession-number']
110
+ self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
111
+
112
+ elif path is not None:
119
113
  self.path = Path(path)
120
114
  if self.path.suffix == '.tar':
121
115
  with tarfile.open(self.path,'r') as tar:
@@ -228,44 +222,86 @@ class Submission:
228
222
  doc = self.metadata.content['documents'][idx]
229
223
 
230
224
  # If loaded from sgml_content, return pre-loaded document
231
- if self.path is None:
225
+ if self.path is None and self.batch_tar_path is None:
232
226
  return self.documents[idx]
233
227
 
234
- # If loaded from path, load document on-demand
228
+ # Get filename
235
229
  filename = doc.get('filename')
236
230
  if filename is None:
237
231
  filename = doc['sequence'] + '.txt'
238
232
 
239
- document_path = self.path / filename
240
- extension = document_path.suffix
233
+ extension = Path(filename).suffix
241
234
 
242
- if self.path.suffix == '.tar':
243
- with tarfile.open(self.path, 'r') as tar:
244
- # bandaid fix TODO
245
- try:
246
- content = tar.extractfile(filename).read()
247
- except:
235
+ # Handle batch tar case
236
+ if self.batch_tar_path is not None:
237
+ with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
238
+ tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
239
+
240
+ # Try different filename variations for compressed files
241
+ possible_filenames = [
242
+ f'{self.accession_prefix}/{filename}',
243
+ f'{self.accession_prefix}/{filename}.gz',
244
+ f'{self.accession_prefix}/{filename}.zst'
245
+ ]
246
+
247
+ content = None
248
+ actual_filename = None
249
+ for attempt_filename in possible_filenames:
248
250
  try:
249
- content = tar.extractfile(filename+'.gz').read()
251
+ content = tar_handle.extractfile(attempt_filename).read()
252
+ actual_filename = attempt_filename
253
+ break
250
254
  except:
251
- try:
252
- content = tar.extractfile(filename+'.zst').read()
253
- except:
254
- # some of these issues are on SEC data end, will fix when I setup cloud.
255
- raise ValueError(f"Something went wrong with tar: {self.path}")
255
+ continue
256
+
257
+ if content is None:
258
+ raise ValueError(f"Could not find document in batch tar: {self.batch_tar_path}, accession: {self.accession_prefix}, filename: {filename}")
259
+
256
260
  # Decompress if compressed
257
- if filename.endswith('.gz'):
261
+ if actual_filename.endswith('.gz'):
258
262
  content = gzip.decompress(content)
259
- elif filename.endswith('.zst'):
263
+ elif actual_filename.endswith('.zst'):
260
264
  dctx = zstd.ZstdDecompressor()
261
265
  content = dctx.decompress(content)
266
+
267
+ # Decode text files
268
+ if extension in ['.htm', '.html', '.txt', '.xml']:
269
+ content = content.decode('utf-8', errors='replace')
270
+
271
+ document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
272
+
273
+ # Handle regular path case (existing logic)
262
274
  else:
263
- with document_path.open('rb') as f:
264
- content = f.read()
275
+ document_path = self.path / filename
276
+
277
+ if self.path.suffix == '.tar':
278
+ with tarfile.open(self.path, 'r') as tar:
279
+ # so here is where we should use bytes instead with byte offset.
280
+ # bandaid fix TODO
281
+ try:
282
+ content = tar.extractfile(filename).read()
283
+ except:
284
+ try:
285
+ content = tar.extractfile(filename+'.gz').read()
286
+ except:
287
+ try:
288
+ content = tar.extractfile(filename+'.zst').read()
289
+ except:
290
+ # some of these issues are on SEC data end, will fix when I setup cloud.
291
+ raise ValueError(f"Something went wrong with tar: {self.path}")
292
+ # Decompress if compressed
293
+ if filename.endswith('.gz'):
294
+ content = gzip.decompress(content)
295
+ elif filename.endswith('.zst'):
296
+ dctx = zstd.ZstdDecompressor()
297
+ content = dctx.decompress(content)
298
+ else:
299
+ with document_path.open('rb') as f:
300
+ content = f.read()
265
301
 
266
- # Decode text files
267
- if extension in ['.htm', '.html', '.txt', '.xml']:
268
- content = content.decode('utf-8', errors='replace')
302
+ # Decode text files
303
+ if extension in ['.htm', '.html', '.txt', '.xml']:
304
+ content = content.decode('utf-8', errors='replace')
269
305
 
270
306
  return Document(
271
307
  type=doc['type'],
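
A Submission can now be backed by a shared batch tar, in which case its metadata path and document paths use a "tar_path::member" notation and all reads go through the owning Portfolio's per-archive lock. A small sketch, continuing the Portfolio example earlier and assuming Document exposes the path it is constructed with:

# 'portfolio' as loaded in the Portfolio sketch above.
submission = portfolio.submissions[0]

print(submission.accession)       # taken from metadata.json ('accession-number')
print(submission.filing_date)     # normalized to 'YYYY-MM-DD'
print(submission.metadata.path)   # '<batch tar path>::<accession prefix>/metadata.json'

# Document entries are listed in metadata; file contents (including .gz/.zst members)
# are only read from the shared tar handle when a document is actually accessed.
print(len(submission.metadata.content['documents']))
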
datamule/utils/construct_submissions_data.py ADDED
@@ -0,0 +1,150 @@
1
+ import zipfile
2
+ import json
3
+ import csv
4
+ import os
5
+ import tempfile
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ import threading
8
+ from tqdm import tqdm
9
+ import urllib.request
10
+
11
+ headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
12
+
13
+ def process_file_batch(zip_file, filenames_batch):
14
+ """Process a batch of files from the zip archive"""
15
+ batch_filings = []
16
+
17
+ for filename in filenames_batch:
18
+ if not filename.startswith('CIK'):
19
+ continue
20
+
21
+ try:
22
+ # Extract CIK from filename
23
+ cik = int(filename.split('.')[0].split('-')[0][3:])
24
+
25
+ # Read raw bytes and parse JSON
26
+ with zip_file.open(filename) as file:
27
+ raw_data = file.read()
28
+ submissions_dct = json.loads(raw_data)
29
+
30
+ # Handle different file types
31
+ if 'submissions' in filename:
32
+ filings_data = submissions_dct
33
+ else:
34
+ filings_data = submissions_dct['filings']['recent']
35
+
36
+ # Extract required data
37
+ accession_numbers = filings_data['accessionNumber']
38
+ filing_dates = filings_data['filingDate']
39
+ forms = filings_data['form']
40
+
41
+ # Create filing records for this file
42
+ for j in range(len(accession_numbers)):
43
+ filing_record = {
44
+ 'accessionNumber': int(accession_numbers[j].replace('-','')),
45
+ 'filingDate': filing_dates[j],
46
+ 'submissionType': forms[j],
47
+ 'cik': cik
48
+ }
49
+ batch_filings.append(filing_record)
50
+
51
+ except Exception as e:
52
+ print(f"Error processing {filename}: {e}")
53
+ continue
54
+
55
+ return batch_filings
56
+
57
+ def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
58
+ """Thread-safe CSV writing with lock"""
59
+ with write_lock:
60
+ if is_first_write:
61
+ with open(output_path, 'w', newline='') as csvfile:
62
+ fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
63
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
64
+ writer.writeheader()
65
+ writer.writerows(filings_data)
66
+ else:
67
+ with open(output_path, 'a', newline='') as csvfile:
68
+ fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
69
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
70
+ writer.writerows(filings_data)
71
+
72
+ def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4, batch_size=100):
73
+ """Creates a list of dicts of every accession number, with filing date, submission type, and ciks"""
74
+
75
+ if submissions_zip_path is None:
76
+ url = "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip"
77
+
78
+ temp_dir = tempfile.mkdtemp()
79
+ zip_path = os.path.join(temp_dir, 'submissions.zip')
80
+
81
+ req = urllib.request.Request(url, headers=headers)
82
+
83
+ with urllib.request.urlopen(req) as response:
84
+ total_size = int(response.headers.get('Content-Length', 0))
85
+
86
+ with open(zip_path, 'wb') as f, tqdm(
87
+ desc="Downloading",
88
+ total=total_size,
89
+ unit='B',
90
+ unit_scale=True,
91
+ unit_divisor=1024,
92
+ ) as pbar:
93
+ while True:
94
+ chunk = response.read(8192)
95
+ if not chunk:
96
+ break
97
+ f.write(chunk)
98
+ pbar.update(len(chunk))
99
+
100
+ submissions_zip_path = zip_path
101
+
102
+ # Keep zip file open throughout processing
103
+ with zipfile.ZipFile(submissions_zip_path, 'r') as zip_file:
104
+ # Get all CIK filenames
105
+ all_filenames = [f for f in zip_file.namelist() if f.startswith('CIK')]
106
+
107
+ print(f"Processing {len(all_filenames)} files with {max_workers} workers...")
108
+
109
+ # Create batches of filenames
110
+ filename_batches = []
111
+ for i in range(0, len(all_filenames), batch_size):
112
+ batch = all_filenames[i:i + batch_size]
113
+ filename_batches.append(batch)
114
+
115
+ # Setup for threading
116
+ write_lock = threading.Lock()
117
+ total_filings = 0
118
+ is_first_write = True
119
+
120
+ # Process batches with thread pool
121
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
122
+ # Submit all batch jobs
123
+ future_to_batch = {
124
+ executor.submit(process_file_batch, zip_file, batch): i
125
+ for i, batch in enumerate(filename_batches)
126
+ }
127
+
128
+ # Process results with progress bar
129
+ with tqdm(total=len(filename_batches), desc="Processing batches", unit="batch") as pbar:
130
+ for future in future_to_batch:
131
+ try:
132
+ batch_filings = future.result()
133
+
134
+ if batch_filings: # Only write if we have data
135
+ write_csv_chunk(output_path, batch_filings, is_first_write, write_lock)
136
+ is_first_write = False
137
+ total_filings += len(batch_filings)
138
+
139
+ pbar.update(1)
140
+ pbar.set_postfix({
141
+ 'filings': total_filings,
142
+ 'files': len(filename_batches[future_to_batch[future]])
143
+ })
144
+
145
+ except Exception as e:
146
+ print(f"Error processing batch: {e}")
147
+ pbar.update(1)
148
+
149
+ print(f"Complete! Processed {total_filings} total filings")
150
+ print(f"Data saved to {output_path}")
datamule-1.6.0.dist-info/METADATA → datamule-1.6.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.6.0
3
+ Version: 1.6.2
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
datamule-1.6.0.dist-info/RECORD → datamule-1.6.2.dist-info/RECORD CHANGED
@@ -3,9 +3,9 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
3
3
  datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
4
4
  datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
5
5
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
6
- datamule/portfolio.py,sha256=eF1eDSwIg-CI8ZmZAHRjCGU0UhuPN4ijxPB0YDT4s2o,8023
6
+ datamule/portfolio.py,sha256=360kfXmmnVFrmpz16KF2es6Mq94lnVqzie2DIgnMB9Y,11641
7
7
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
8
- datamule/submission.py,sha256=vAiYNas1YrWgm4Grw24peJbfSUVERySEko1zmdtG49s,13033
8
+ datamule/submission.py,sha256=f2pecbuhK0VmN1w0beNUiK4n_4Ma_GGQ5JIGilmZPZE,15127
9
9
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
10
10
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
@@ -51,7 +51,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
51
51
  datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
52
  datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
53
53
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
54
- datamule/sec/submissions/monitor.py,sha256=ll0nfHzG8FI3bA8zVFrfsfZGnbt5qAD4rRZ4LG2SORY,9567
54
+ datamule/sec/submissions/monitor.py,sha256=1JUMRYsTqtd31hX3UrUA_aXFUmZN6n-V7h0i1gavNOs,11395
55
55
  datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
56
56
  datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
57
57
  datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,9 +61,11 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
61
61
  datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
62
62
  datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
64
- datamule/seclibrary/downloader.py,sha256=ylv69VF22IVfrdeCkiGr5mVa2GKrPC9zFiDJU1fiBu8,17262
64
+ datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOaHWXs,17564
65
65
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
66
- datamule-1.6.0.dist-info/METADATA,sha256=E4F7MeBNWhHn19TH7eUyQN_vnONCvw-NiObNCRbsLE0,524
67
- datamule-1.6.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
68
- datamule-1.6.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
69
- datamule-1.6.0.dist-info/RECORD,,
66
+ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
+ datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
68
+ datamule-1.6.2.dist-info/METADATA,sha256=VOeuSq7t_-D7dKJjWrHuEg9zwqNvLWU08dGL7W2T0ow,524
69
+ datamule-1.6.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
70
+ datamule-1.6.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
71
+ datamule-1.6.2.dist-info/RECORD,,