datamule 1.2.2__py3-none-any.whl → 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. datamule/__init__.py +1 -0
  2. datamule/document/document.py +27 -14
  3. datamule/document/mappings/atsn.py +208 -0
  4. datamule/document/mappings/cfportal.py +346 -0
  5. datamule/document/mappings/d.py +125 -0
  6. datamule/document/mappings/ex102_abs.py +63 -0
  7. datamule/document/mappings/ex99a_sdr.py +1 -0
  8. datamule/document/mappings/ex99c_sdr.py +0 -0
  9. datamule/document/mappings/ex99g_sdr.py +0 -0
  10. datamule/document/mappings/ex99i_sdr.py +0 -0
  11. datamule/document/mappings/information_table.py +35 -0
  12. datamule/document/mappings/nmfp.py +275 -0
  13. datamule/document/mappings/npx.py +85 -0
  14. datamule/document/mappings/onefourtyfour.py +68 -0
  15. datamule/document/mappings/ownership.py +163 -0
  16. datamule/document/mappings/proxy_voting_record.py +17 -0
  17. datamule/document/mappings/sbs.py +0 -0
  18. datamule/document/mappings/sbsef.py +13 -0
  19. datamule/document/mappings/schedule13.py +117 -0
  20. datamule/document/mappings/sdr.py +63 -0
  21. datamule/document/mappings/submission_metadata.py +9 -0
  22. datamule/document/mappings/ta.py +0 -0
  23. datamule/document/mappings/thirteenfhr.py +72 -0
  24. datamule/document/mappings/twentyfivense.py +22 -0
  25. datamule/document/mappings/twentyfourf2nt.py +100 -0
  26. datamule/document/processing.py +170 -42
  27. datamule/document/table.py +60 -5
  28. datamule/helper.py +10 -1
  29. datamule/index.py +8 -10
  30. datamule/portfolio.py +17 -16
  31. datamule/sec/submissions/monitor.py +173 -120
  32. datamule/sec/submissions/textsearch.py +0 -4
  33. datamule/sec/xbrl/streamcompanyfacts.py +1 -1
  34. datamule/seclibrary/downloader.py +2 -2
  35. datamule/submission.py +92 -36
  36. {datamule-1.2.2.dist-info → datamule-1.2.9.dist-info}/METADATA +1 -2
  37. datamule-1.2.9.dist-info/RECORD +62 -0
  38. datamule/sec/rss/monitor.py +0 -416
  39. datamule-1.2.2.dist-info/RECORD +0 -40
  40. /datamule/{sec/rss → document/mappings}/__init__.py +0 -0
  41. {datamule-1.2.2.dist-info → datamule-1.2.9.dist-info}/WHEEL +0 -0
  42. {datamule-1.2.2.dist-info → datamule-1.2.9.dist-info}/top_level.txt +0 -0
datamule/sec/submissions/monitor.py CHANGED
@@ -1,130 +1,183 @@
+import time
+from collections import deque
+from datetime import datetime
+import xml.etree.ElementTree as ET
+import re
 import asyncio
-from datetime import datetime, timedelta
-from .eftsquery import EFTSQuery # Import the class directly instead of the function
-from ..rss.monitor import start_monitor # Import start_monitor directly
-import pytz
+from ..utils import headers, PreciseRateLimiter
+from .eftsquery import EFTSQuery
+import aiohttp
 
 
-async def _process_efts_hits(hits, collected_accession_numbers, data_callback=None,rate_limiter=None):
-    """Process EFTS hits, collect accession numbers, and call data callback."""
-    processed_hits = []
+async def poll_rss(limiter):
+    base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
 
-    for hit in hits:
-        try:
-            source = hit.get('_source', {})
-
-            # Extract key fields
-            accession_number = source.get('adsh')
-
-            # Extract submission_type (form) and ciks
-            submission_type = source.get('form')
-            ciks = source.get('ciks', [])
-            ciks = [str(int(cik)) for cik in ciks]
-
-            filing_date = source.get('file_date')
-
-            # Create standardized filing record
-            filing = {
-                'accession_number': accession_number,
-                'submission_type': submission_type,
-                'ciks': ciks,
-                'filing_date': filing_date,
-
-            }
-
-            processed_hits.append(filing)
-            collected_accession_numbers.add(accession_number) # Changed append to add for set operation
-
-        except Exception as e:
-            print(f"Error processing EFTS hit: {e}")
+    # Create a session specifically for this RSS polling operation
+    async with aiohttp.ClientSession(headers=headers) as session:
+        # Use the rate limiter before making the request
+        async with limiter:
+            # Make the HTTP request with the session
+            async with session.get(base_url) as response:
+                content = await response.read()
 
-    # Call data callback if provided
-    if data_callback and processed_hits:
-        await data_callback(processed_hits, rate_limiter)
+    # Process the content
+    content_str = content.decode('utf-8')
+    root = ET.fromstring(content_str)
+    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
+    entries = root.findall('atom:entry', namespace)
+    grouped = {}
+
+    for entry in entries:
+        url = entry.find('atom:link', namespace).get('href')
+        accession = re.search(r'/(\d{10})-(\d{2})-(\d{6})', url)
+        accession = accession.group(1) + accession.group(2) + accession.group(3)
+        cik = re.search(r'/data/(\d+)/', url).group(1)
 
-    return processed_hits
-
-async def _master_monitor_impl(data_callback=None, poll_callback=None, submission_type=None, cik=None,
-                               polling_interval=200, requests_per_second=2.0, quiet=True, start_date=None):
-    """Implementation of the master monitor."""
-    # Set default start date to today if not provided (eastern)
-    eastern_tz = pytz.timezone('US/Eastern')
-    current_date = datetime.now(eastern_tz).strftime('%Y-%m-%d')
-    if not start_date:
-        start_date = current_date
+        if accession not in grouped:
+            grouped[accession] = {'submission_type': '', 'ciks': set(), 'filing_date': ''}
 
-    # Changed from list to set for more efficient lookups
-    collected_accession_numbers = set()
-
-    if not quiet:
-        print(f"Starting SEC monitoring from {start_date}")
-
-    # Step 1: Query EFTS for all filings from start_date up to current date
-    if not quiet:
-        print(f"Fetching filings from {start_date} to {current_date}...")
-
-    # Prepare a wrapper callback to collect accession numbers
-    async def process_callback(hits):
-        await _process_efts_hits(hits, collected_accession_numbers, data_callback, efts_query.limiter)
-
-    # Create an EFTSQuery instance
-    efts_query = EFTSQuery(requests_per_second=requests_per_second)
+        grouped[accession]['ciks'].add(cik)
+        grouped[accession]['submission_type'] = entry.find('atom:category', namespace).get('term')
+        summary_text = entry.find('atom:summary', namespace).text
+        filing_date_match = re.search(r'Filed:</b>\s*(\d{4}-\d{2}-\d{2})', summary_text)
+        if filing_date_match:
+            grouped[accession]['filing_date'] = filing_date_match.group(1)
+
+    results = [{'accession': int(k.replace('-', '')), 'submission_type': v['submission_type'], 'ciks': list(v['ciks']), 'filing_date': v['filing_date']} for k, v in grouped.items()]
+    return results
+
+def clean_efts_hits(hits):
+    # clean hits
+    hits = [{'accession': int(hit['_source']['adsh'].replace('-','')), 'filing_date': hit['_source']['file_date'], 'ciks': hit['_source']['ciks']} for hit in hits]
+    return hits
+
+class Monitor():
+    def __init__(self):
+        self.accessions = deque(maxlen=50000)
+        self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
+        self.efts_query = EFTSQuery(quiet=True)
+        self.efts_query.limiter = self.ratelimiters['sec.gov']
+
+    def set_domain_rate_limit(self, domain, rate):
+        self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
+        if domain == 'sec.gov':
+            self.efts_query.limiter = self.ratelimiters[domain]
 
-    # Run EFTS query for the date range
-    async with efts_query:
-        await efts_query.query(
-            cik=cik,
-            submission_type=submission_type,
-            filing_date=(start_date, current_date),
-            callback=process_callback
+    async def _async_run_efts_query(self, **kwargs):
+        """Async helper method to run EFTS query without creating a new event loop"""
+        # Make sure to set quiet parameter if provided in kwargs
+        self.efts_query.quiet = kwargs.get('quiet', True)
+        return await self.efts_query.query(
+            cik=kwargs.get('cik'),
+            submission_type=kwargs.get('submission_type'),
+            filing_date=kwargs.get('filing_date'),
+            location=kwargs.get('location'),
+            callback=kwargs.get('callback'),
+            name=kwargs.get('name')
         )
+
+    async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
+        """
+        Async implementation of monitor_submissions.
+        """
+
+        # Backfill if start_date is provided
+        if start_date is not None:
+            today_date = datetime.now().date().strftime('%Y-%m-%d')
+            if not quiet:
+                print(f"Backfilling from {start_date} to {today_date}")
+
+            hits = clean_efts_hits(await self._async_run_efts_query(
+                filing_date=(start_date, today_date),
+                quiet=quiet
+            ))
+
+            new_hits = self._filter_new_accessions(hits)
+            if not quiet:
+                print(f"New submissions found: {len(new_hits)}")
+            if new_hits and data_callback:
+                data_callback(new_hits)
+
+        last_polling_time = time.time()
+        last_validation_time = last_polling_time
+        current_time = last_polling_time
+
+        while True:
+            # RSS polling
+            if not quiet:
+                print(f"Polling RSS feed")
+            results = await poll_rss(self.ratelimiters['sec.gov'])
+            new_results = self._filter_new_accessions(results)
+            if new_results:
+                if not quiet:
+                    print(f"Found {len(new_results)} new submissions via RSS")
+                if data_callback:
+                    data_callback(new_results)
+
+            # EFTS validation
+            if validation_interval and (current_time - last_validation_time) >= validation_interval/1000:
+                # Get submissions from the last 24 hours for validation
+                today_date = datetime.now().strftime('%Y-%m-%d')
+                if not quiet:
+                    print(f"Validating submissions from {today_date}")
+
+                hits = clean_efts_hits(await self._async_run_efts_query(
+                    filing_date=(today_date, today_date),
+                    quiet=quiet
+                ))
+
+                new_hits = self._filter_new_accessions(hits)
+                if new_hits:
+                    if not quiet:
+                        print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                    if data_callback:
+                        data_callback(new_hits)
+                last_polling_time = time.time()
+                last_validation_time = current_time
+
+            # Interval callback
+            if interval_callback:
+                interval_callback()
+
+            next_poll_time = last_polling_time + (polling_interval / 1000)
+            current_time = time.time()
+            time_to_sleep = max(0, next_poll_time - current_time)
+            await asyncio.sleep(time_to_sleep)
+            last_polling_time = next_poll_time
+
+
+    def monitor_submissions(self, data_callback=None, interval_callback=None,
+                            polling_interval=1000, quiet=True, start_date=None,
+                            validation_interval=60000):
+        """
+        Monitor SEC submissions using the EDGAR system.
+        :param data_callback: function to call with the data
+        :param interval_callback: function that executes between polls
+        :param polling_interval: interval between polls in milliseconds
+        :param quiet: if True, suppresses output
+        :param start_date: backfill start date in YYYY-MM-DD format
+        :param validation_interval: interval between validation in milliseconds
+
+        This function combines the speed of the RSS feed (fast, but misses some submissions) with the accuracy of the EFTS system.
+        """
+        # This is now a synchronous wrapper around the async implementation
+        return asyncio.run(self._async_monitor_submissions(
+            data_callback=data_callback,
+            interval_callback=interval_callback,
+            polling_interval=polling_interval,
+            quiet=quiet,
+            start_date=start_date,
+            validation_interval=validation_interval
+        ))
 
-    if not quiet:
-        print(f"Historical query complete. Collected {len(collected_accession_numbers)} accession numbers.")
-
-    # Step 2: Hand off to RSS monitor with collected accession numbers
-    if not quiet:
-        print("Starting real-time RSS monitoring...")
-
-    # Start RSS monitor with the set of accession numbers to skip (from EFTS)
-    # and an empty list for ongoing tracking
-    await start_monitor(
-        data_callback=data_callback,
-        poll_callback=poll_callback,
-        submission_type=submission_type,
-        cik=cik,
-        polling_interval=polling_interval,
-        requests_per_second=requests_per_second,
-        quiet=quiet,
-        known_accession_numbers=[], # Start with an empty list for ongoing tracking
-        skip_initial_accession_numbers=collected_accession_numbers # Pass the EFTS accession numbers as the skip list
-    )
-
-def monitor(data_callback=None, poll_callback=None, submission_type=None, cik=None,
-            polling_interval=200, requests_per_second=2.0, quiet=True, start_date=None):
-    """
-    Monitor SEC filings by combining EFTS historical queries with real-time RSS monitoring.
-
-    Parameters:
-    data_callback (callable): Async function to call when new filings are found.
-                              Will be called with a list of dicts containing
-                              'accession_number', 'submission_type', and 'ciks'.
-    poll_callback (callable): Async function to call during RSS polling wait periods.
-    submission_type (str or list): Form type(s) to monitor (e.g., "8-K", "10-Q").
-    cik (str or list): CIK(s) to monitor.
-    polling_interval (int): Polling interval in milliseconds for RSS monitor.
-    requests_per_second (float): Maximum requests per second.
-    quiet (bool): Suppress verbose output.
-    start_date (str): ISO format date (YYYY-MM-DD) from which to start monitoring.
-                      If None, will start from current date. (EASTERN TIME)
-    """
-    return asyncio.run(_master_monitor_impl(
-        data_callback=data_callback,
-        poll_callback=poll_callback,
-        submission_type=submission_type,
-        cik=cik,
-        polling_interval=polling_interval,
-        requests_per_second=requests_per_second,
-        quiet=quiet,
-        start_date=start_date
-    ))
+    def _filter_new_accessions(self, items):
+        """Filter items to only include those with new accession numbers."""
+        new_items = []
+        for item in items:
+            accession = item['accession']
+            if accession not in self.accessions:
+                self.accessions.append(accession)
+                new_items.append(item)
+        return new_items
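For orientation, here is a minimal usage sketch of the new Monitor class added above. It is based only on what the diff shows; the import path follows the module location in the wheel (the one-line datamule/__init__.py change is not shown here, so a shorter top-level import may or may not exist), and the callback, dates, and rate value are illustrative.

from datamule.sec.submissions.monitor import Monitor

def print_new_submissions(submissions):
    # Each item carries 'accession', 'ciks', 'filing_date';
    # RSS results also include 'submission_type'
    for sub in submissions:
        print(sub['accession'], sub.get('submission_type', ''), sub['ciks'], sub['filing_date'])

monitor = Monitor()
monitor.set_domain_rate_limit('sec.gov', 5)      # optional; sec.gov defaults to 5 req/s

monitor.monitor_submissions(
    data_callback=print_new_submissions,
    polling_interval=1000,       # poll the RSS feed roughly once per second
    validation_interval=60000,   # re-check the current day via EFTS every minute
    start_date='2025-01-01',     # illustrative; backfill via EFTS before live polling
    quiet=False,
)

The RSS poll gives low latency while the periodic EFTS validation pass catches anything the feed missed; the deque of seen accession numbers keeps the callback from firing twice for the same filing.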
datamule/sec/submissions/textsearch.py CHANGED
@@ -1,8 +1,4 @@
 import asyncio
-import aiohttp
-from datetime import datetime
-from urllib.parse import urlencode
-from tqdm import tqdm
 from .eftsquery import EFTSQuery
 
 class TextSearchEFTSQuery(EFTSQuery):
datamule/sec/xbrl/streamcompanyfacts.py CHANGED
@@ -2,7 +2,7 @@ import asyncio
 import aiohttp
 import json
 from tqdm import tqdm
-from ..utils import PreciseRateLimiter, RateMonitor, RetryException, headers
+from ..utils import PreciseRateLimiter, RateMonitor, headers
 
 async def fetch_company_facts(session, cik, rate_limiter, rate_monitor, pbar):
     # Format CIK with leading zeros to 10 digits
datamule/seclibrary/downloader.py CHANGED
@@ -1,7 +1,6 @@
 import os
 import asyncio
 import aiohttp
-from pathlib import Path
 from tqdm import tqdm
 import time
 import shutil
@@ -13,11 +12,12 @@ from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from queue import Queue, Empty
 from threading import Thread
-from secsgml import parse_sgml_submission
 from .query import query
 from os import cpu_count
 from ..submission import Submission
 
+
+
 class Downloader:
     def __init__(self, api_key=None):
         self.BASE_URL = "https://library.datamule.xyz/original/nc/"
datamule/submission.py CHANGED
@@ -4,6 +4,70 @@ from .document.document import Document
 from secsgml import parse_sgml_submission_into_memory
 import os
 import aiofiles
+import tempfile
+
+
+# # NEW CODE YAY. probably will remove
+
+# def save_metadata_atomically(metadata_file_path, metadata_content):
+#     """Save metadata to a JSONL file atomically, works on any filesystem"""
+
+#     # Create directory if it doesn't exist
+#     os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
+
+#     # Format the JSON with newline
+#     json_str = json.dumps(metadata_content, indent=4) + "\n"
+
+#     # Write complete content to a temporary file first
+#     fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
+#     try:
+#         with os.fdopen(fd, 'w') as temp_file:
+#             temp_file.write(json_str)
+#             temp_file.flush()
+#             os.fsync(temp_file.fileno()) # Force write to disk
+
+#         # Append the temporary file to the main file
+#         with open(metadata_file_path, 'a') as target_file:
+#             with open(temp_path, 'r') as temp_read:
+#                 content = temp_read.read()
+#                 target_file.write(content)
+#                 target_file.flush()
+#                 os.fsync(target_file.fileno()) # Force write to disk
+#     finally:
+#         # Clean up the temporary file
+#         if os.path.exists(temp_path):
+#             os.unlink(temp_path)
+
+# async def save_metadata_atomically_async(metadata_file_path, metadata_content):
+#     """Save metadata to a JSONL file atomically in async mode"""
+
+#     # Create directory if it doesn't exist
+#     os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
+
+#     # Format the JSON with newline
+#     json_str = json.dumps(metadata_content, indent=4) + "\n"
+
+#     # Write to a temporary file first
+#     fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
+#     os.close(fd) # Close the file descriptor
+
+#     try:
+#         async with aiofiles.open(temp_path, 'w') as temp_file:
+#             await temp_file.write(json_str)
+#             await temp_file.flush()
+
+#         # Append the temporary file to the main file
+#         async with aiofiles.open(metadata_file_path, 'a') as target_file:
+#             async with aiofiles.open(temp_path, 'r') as temp_read:
+#                 content = await temp_read.read()
+#                 await target_file.write(content)
+#                 await target_file.flush()
+#     finally:
+#         # Clean up the temporary file
+#         if os.path.exists(temp_path):
+#             os.unlink(temp_path)
+
+# # END OF NEW CODE
 
 
 class Submission:
@@ -15,15 +79,17 @@ class Submission:
 
         if sgml_content is not None:
            self.path = None
-            self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+            metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
 
-            self.accession = self.metadata['accession-number']
-            self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
+            # code dupe
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
            self.documents = []
            filtered_metadata_documents = []
 
-            for idx,doc in enumerate(self.metadata['documents']):
+            for idx,doc in enumerate(self.metadata.content['documents']):
                type = doc.get('type')
 
                # Keep only specified types
@@ -35,13 +101,19 @@ class Submission:
 
                filtered_metadata_documents.append(doc)
 
-            self.metadata['documents'] = filtered_metadata_documents
+            self.metadata.content['documents'] = filtered_metadata_documents
 
        if path is not None:
            self.path = Path(path)
            metadata_path = self.path / 'metadata.json'
            with metadata_path.open('r') as f:
-                self.metadata = json.load(f)
+                metadata = json.load(f)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+
+            # Code dupe
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
 
 
 
@@ -52,7 +124,7 @@ class Submission:
        else:
            document_types = document_type
 
-        for idx,doc in enumerate(self.metadata['documents']):
+        for idx,doc in enumerate(self.metadata.content['documents']):
            if doc['type'] in document_types:
 
                # if loaded from path
@@ -65,9 +137,12 @@ class Submission:
                    document_path = self.path / filename
                    extension = document_path.suffix
 
-                    with document_path.open('r') as f:
+                    with document_path.open('rb') as f:
                        content = f.read()
 
+                    if extension in ['.htm','.html','.txt','.xml']:
+                        content = content.decode('utf-8', errors='replace')
+
                    yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
                # if loaded from sgml_content
                else:
@@ -75,7 +150,7 @@ class Submission:
 
 
    def __iter__(self):
-        for idx,doc in enumerate(self.metadata['documents']):
+        for idx,doc in enumerate(self.metadata.content['documents']):
            # if loaded from path
            if self.path is not None:
                filename = doc.get('filename')
@@ -89,9 +164,12 @@ class Submission:
 
                # check if the file exists
                if document_path.exists():
-                    with document_path.open('r') as f:
+                    with document_path.open('rb') as f:
                        content = f.read()
 
+                    if extension in ['.htm','.html','.txt','.xml']:
+                        content = content.decode('utf-8', errors='replace')
+
                    yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
                else:
                    print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
@@ -100,28 +178,6 @@ class Submission:
            else:
                yield self.documents[idx]
 
-    # keep documents by document type
-    def keep(self, document_type):
-        # Convert single document type to list for consistent handling
-        if isinstance(document_type, str):
-            document_types = [document_type]
-        else:
-            document_types = document_type
-
-        if self.path is not None:
-            for doc in self.metadata['documents']:
-                filename = doc.get('filename')
-                type = doc.get('type')
-                if type not in document_types:
-                    # oh we need handling here for sequences case
-                    if filename is None:
-                        filename = doc.sequence + '.txt'
-
-                    document_path = self.path / filename
-                    # delete the file
-                    document_path.unlink()
-        else:
-            print("Warning: keep() method is only available when loading from path.")
 
 
 
@@ -131,9 +187,9 @@ class Submission:
 
        metadata_path = file_dir / "metadata.json"
        with open(metadata_path, 'w') as f:
-            json.dump(self.metadata, f, indent=4)
+            json.dump(self.metadata.content, f, indent=4)
 
-        for idx, doc in enumerate(self.metadata['documents']):
+        for idx, doc in enumerate(self.metadata.content['documents']):
            try:
                filename = doc.get('filename')
                if filename is None:
@@ -172,9 +228,9 @@ class Submission:
 
        metadata_path = file_dir / "metadata.json"
        async with aiofiles.open(metadata_path, 'w') as f:
-            await f.write(json.dumps(self.metadata, indent=4))
+            await f.write(json.dumps(self.metadata.content, indent=4))
 
-        for idx, doc in enumerate(self.metadata['documents']):
+        for idx, doc in enumerate(self.metadata.content['documents']):
            try:
                filename = doc.get('filename')
                if filename is None:
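A short sketch of how the reworked Submission API reads after these changes: metadata is now wrapped in a Document instead of being a plain dict, so the parsed data sits on .content, and document files are read as bytes with only text-like extensions decoded to str. This is based solely on the hunks above; the directory path is made up for illustration.

from datamule.submission import Submission

# Load a submission previously saved to disk (a directory containing metadata.json);
# the path is illustrative
sub = Submission(path='filings/0001234567-25-000001')

# Metadata is now a Document wrapper; the parsed dict lives on .content
print(sub.accession, sub.filing_date)
print(len(sub.metadata.content['documents']))

# Iteration still yields Document objects; .htm/.html/.txt/.xml content arrives as str
# (decoded with errors='replace'), everything else stays as bytes
for doc in sub:
    print(type(doc.content))

Note that the keep() method was removed in this version, so pruning documents by type now has to happen at download time or by filtering the yielded documents.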
{datamule-1.2.2.dist-info → datamule-1.2.9.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.2.2
+Version: 1.2.9
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -10,7 +10,6 @@ Requires-Dist: tqdm
 Requires-Dist: requests
 Requires-Dist: nest-asyncio
 Requires-Dist: aiofiles
-Requires-Dist: polars
 Requires-Dist: setuptools
 Requires-Dist: selectolax
 Requires-Dist: pytz
datamule-1.2.9.dist-info/RECORD ADDED
@@ -0,0 +1,62 @@
+datamule/__init__.py,sha256=glzwBeGJEE6-TG7mRule9GH6L59XaIRR9T7ALcdpMus,1067
+datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
+datamule/helper.py,sha256=g9Kb1DWbViCoON06PjOkSX5Ucu0uG7zPwhsO2LQ6C1g,3579
+datamule/index.py,sha256=_7Ox5hyF_7RWdblVFr5rNyv_ARwBP7VY4f703pk9qQ8,2074
+datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
+datamule/portfolio.py,sha256=8fiK-vfZM5-NJSvOEsDR2YDb-2njjzFk6l7BiRyrzOM,7168
+datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
+datamule/submission.py,sha256=Yh5nG3ioumhl6z30wJdIEmKjDDNSuo0r2xycZSIaeIg,11035
+datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/document.py,sha256=menUFoeWwiY0rJnBkQiqY4NWnO0J17-qs8jFvO_1jiY,9969
+datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
+datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
+datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/mappings/atsn.py,sha256=qkZGNIhyPC3VTTOjQ8-FSCQIhUy4XeSycUGLShxNVCo,17743
+datamule/document/mappings/cfportal.py,sha256=bR9d6DDY0kJ_HGx_hND2y1PNNkZjemYZ2KdyFAcv760,25257
+datamule/document/mappings/d.py,sha256=ayRK-bTzelNH6fspp-n3gz6RXOrHVx6IjX-TmisrFe4,7714
+datamule/document/mappings/ex102_abs.py,sha256=FdGKvteRh_HsYgILF-8o4R6aSsjYwcaLpJxzdru4FTE,3976
+datamule/document/mappings/ex99a_sdr.py,sha256=PNdj9I0ZhNicPObLelNmjp33EgTwzvukqkBDnwxarE0,19
+datamule/document/mappings/ex99c_sdr.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/mappings/ex99g_sdr.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/mappings/ex99i_sdr.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/mappings/information_table.py,sha256=6l2Via728I59RS0y9Pit37NoOSAbaT-vclArYxU1vtY,1585
+datamule/document/mappings/nmfp.py,sha256=WuTyM1SkBiiLVAHqFF4DTZ_8AvsIuonT2w7pwYDPTDw,17767
+datamule/document/mappings/npx.py,sha256=xwruBueC09kfWhXV3fNUnQWYwCWrdrhQoVO3cKfPTO4,6556
+datamule/document/mappings/onefourtyfour.py,sha256=_-w9h6wGINGH5pQqQvPrd0cgB5QfCtPG5M40ewf_w8Q,2604
+datamule/document/mappings/ownership.py,sha256=piD9vs4WFrB4yvp6c0pT5bibLKXgsM7hpnBUzaY0Xxs,10155
+datamule/document/mappings/proxy_voting_record.py,sha256=tSqLH065EOUq7U80P5GP1JBqipmAiqniPpP3E4adA1A,721
+datamule/document/mappings/sbs.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/mappings/sbsef.py,sha256=Zw58rbYcnitynk1mh9g1jDrCfqmFlY60OEjPM6p9iF0,534
+datamule/document/mappings/schedule13.py,sha256=lh9sukpEte514Gid77Nz9zh3uBEFZEemrZ2Uau0qsgk,6295
+datamule/document/mappings/sdr.py,sha256=UekqZId5PFMMWRAJSaPvCpN4c1Hx-SLAQPEN8GW_Gbg,4829
+datamule/document/mappings/submission_metadata.py,sha256=pi1eW-tnoAQ6y3laRI29Op80E9BPqqmcfe45owKYStw,271
+datamule/document/mappings/ta.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/mappings/thirteenfhr.py,sha256=XpYRIMPZnGLfEE4TqBI0BPXbyuq0xf3hut1fePOF6kU,4250
+datamule/document/mappings/twentyfivense.py,sha256=lKyj0ZBhkHX9gQJMTUPrQlxYFg3k-aBnWqtoS5bujZM,905
+datamule/document/mappings/twentyfourf2nt.py,sha256=Q7RPT3JgJHjYdjMuaSyAxclt6QPT_LgCQloxp-ByDuI,4118
+datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
+datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
+datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
+datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNsBw5Jv0Tx5aljiGUJkk7DRk,18745
+datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/submissions/downloader.py,sha256=60wX2Yml1UCuxOtU0xMxqqeyHhrypCmlDQ0jZF-StJo,2665
+datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
+datamule/sec/submissions/monitor.py,sha256=s6uknn1dF1EemiI3Hl4nEq3txwK7nYl6wmayuUPYpRs,7844
+datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
+datamule/sec/submissions/textsearch.py,sha256=zEr3NXdhVFL8eMh2jruVXIt7taUZTMdNy2hOAyRM2pA,5706
+datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
+datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
+datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H5N2tbvKUzM,3307
+datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
+datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
+datamule/seclibrary/downloader.py,sha256=PIgz_7ASUTZOHcUZGcD1SmLaGSbq7xe7EiJT0Z7HU4M,13653
+datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
+datamule-1.2.9.dist-info/METADATA,sha256=5bMwIRcARNqP6S1cdPzoIBuu1miiUJUUdWnTXvwtPNk,490
+datamule-1.2.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.2.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.2.9.dist-info/RECORD,,