datamule-1.5.3.tar.gz → datamule-1.5.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {datamule-1.5.3 → datamule-1.5.5}/PKG-INFO +1 -1
  2. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/monitor.py +45 -25
  3. {datamule-1.5.3 → datamule-1.5.5}/datamule/seclibrary/downloader.py +82 -0
  4. {datamule-1.5.3 → datamule-1.5.5}/datamule/submission.py +8 -0
  5. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/PKG-INFO +1 -1
  6. {datamule-1.5.3 → datamule-1.5.5}/setup.py +1 -1
  7. {datamule-1.5.3 → datamule-1.5.5}/datamule/__init__.py +0 -0
  8. {datamule-1.5.3 → datamule-1.5.5}/datamule/config.py +0 -0
  9. {datamule-1.5.3 → datamule-1.5.5}/datamule/data/listed_filer_metadata.csv +0 -0
  10. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/__init__.py +0 -0
  11. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/document.py +0 -0
  12. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/__init__.py +0 -0
  13. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/atsn.py +0 -0
  14. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/cfportal.py +0 -0
  15. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/d.py +0 -0
  16. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex102_abs.py +0 -0
  17. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex99a_sdr.py +0 -0
  18. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex99c_sdr.py +0 -0
  19. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex99g_sdr.py +0 -0
  20. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex99i_sdr.py +0 -0
  21. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/information_table.py +0 -0
  22. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/nmfp.py +0 -0
  23. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/npx.py +0 -0
  24. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/onefourtyfour.py +0 -0
  25. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ownership.py +0 -0
  26. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/proxy_voting_record.py +0 -0
  27. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/sbs.py +0 -0
  28. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/sbsef.py +0 -0
  29. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/schedule13.py +0 -0
  30. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/sdr.py +0 -0
  31. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/submission_metadata.py +0 -0
  32. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ta.py +0 -0
  33. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/thirteenfhr.py +0 -0
  34. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/twentyfivense.py +0 -0
  35. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  36. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/processing.py +0 -0
  37. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/table.py +0 -0
  38. {datamule-1.5.3 → datamule-1.5.5}/datamule/helper.py +0 -0
  39. {datamule-1.5.3 → datamule-1.5.5}/datamule/index.py +0 -0
  40. {datamule-1.5.3 → datamule-1.5.5}/datamule/mapping_dicts/__init__.py +0 -0
  41. {datamule-1.5.3 → datamule-1.5.5}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  42. {datamule-1.5.3 → datamule-1.5.5}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  43. {datamule-1.5.3 → datamule-1.5.5}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  44. {datamule-1.5.3 → datamule-1.5.5}/datamule/package_updater.py +0 -0
  45. {datamule-1.5.3 → datamule-1.5.5}/datamule/portfolio.py +0 -0
  46. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/__init__.py +0 -0
  47. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/infrastructure/__init__.py +0 -0
  48. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  49. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/__init__.py +0 -0
  50. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/downloader.py +0 -0
  51. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/eftsquery.py +0 -0
  52. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/streamer.py +0 -0
  53. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/textsearch.py +0 -0
  54. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/utils.py +0 -0
  55. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/__init__.py +0 -0
  56. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  57. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  58. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  59. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  60. {datamule-1.5.3 → datamule-1.5.5}/datamule/seclibrary/__init__.py +0 -0
  61. {datamule-1.5.3 → datamule-1.5.5}/datamule/seclibrary/bq.py +0 -0
  62. {datamule-1.5.3 → datamule-1.5.5}/datamule/seclibrary/query.py +0 -0
  63. {datamule-1.5.3 → datamule-1.5.5}/datamule/sheet.py +0 -0
  64. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/SOURCES.txt +0 -0
  65. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-1.5.3 → datamule-1.5.5}/setup.cfg +0 -0
--- datamule-1.5.3/PKG-INFO
+++ datamule-1.5.5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.3
+Version: 1.5.5
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
--- datamule-1.5.3/datamule/sec/submissions/monitor.py
+++ datamule-1.5.5/datamule/sec/submissions/monitor.py
@@ -7,7 +7,7 @@ import asyncio
 from ..utils import headers, PreciseRateLimiter
 from .eftsquery import EFTSQuery
 import aiohttp
-
+from zoneinfo import ZoneInfo
 
 async def poll_rss(limiter):
     base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
@@ -77,15 +77,22 @@ class Monitor():
         )
 
     async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
-                                polling_interval=1000, quiet=True, start_date=None,
-                                validation_interval=60000):
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
         """
         Async implementation of monitor_submissions.
+        Either polling_interval or validation_interval (or both) must be specified.
+        If polling_interval is None, only EFTS validation will be performed.
+        If validation_interval is None, only RSS polling will be performed.
         """
 
+        # Validate that at least one interval is specified
+        if polling_interval is None and validation_interval is None:
+            raise ValueError("At least one of polling_interval or validation_interval must be specified")
+
         # Backfill if start_date is provided
         if start_date is not None:
-            today_date = datetime.now().date().strftime('%Y-%m-%d')
+            today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
             if not quiet:
                 print(f"Backfilling from {start_date} to {today_date}")
 
@@ -100,26 +107,35 @@ class Monitor():
             if new_hits and data_callback:
                 data_callback(new_hits)
 
-        last_polling_time = time.time()
-        last_validation_time = last_polling_time
-        current_time = last_polling_time
-
+        # Initialize timing variables
+        current_time = time.time()
+        last_polling_time = current_time
+        last_validation_time = current_time
+
+        # Determine which operations to perform
+        do_polling = polling_interval is not None
+        do_validation = validation_interval is not None
+
         while True:
-            # RSS polling
-            if not quiet:
-                print(f"Polling RSS feed")
-            results = await poll_rss(self.ratelimiters['sec.gov'])
-            new_results = self._filter_new_accessions(results)
-            if new_results:
+            current_time = time.time()
+
+            # RSS polling (if enabled)
+            if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
                 if not quiet:
-                    print(f"Found {len(new_results)} new submissions via RSS")
-                if data_callback:
-                    data_callback(new_results)
+                    print(f"Polling RSS feed")
+                results = await poll_rss(self.ratelimiters['sec.gov'])
+                new_results = self._filter_new_accessions(results)
+                if new_results:
+                    if not quiet:
+                        print(f"Found {len(new_results)} new submissions via RSS")
+                    if data_callback:
+                        data_callback(new_results)
+                last_polling_time = current_time
 
-            # EFTS validation
-            if validation_interval and (current_time - last_validation_time) >= validation_interval/1000:
+            # EFTS validation (if enabled)
+            if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
                 # Get submissions from the last 24 hours for validation
-                today_date = datetime.now().strftime('%Y-%m-%d')
+                today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
                 if not quiet:
                     print(f"Validating submissions from {today_date}")
 
@@ -134,19 +150,23 @@ class Monitor():
                     print(f"Found {len(new_hits)} new submissions via EFTS validation")
                     if data_callback:
                         data_callback(new_hits)
-                last_polling_time = time.time()
                 last_validation_time = current_time
 
             # Interval callback
             if interval_callback:
                 interval_callback()
 
-            next_poll_time = last_polling_time + (polling_interval / 1000)
+            # Calculate next wake-up time
+            next_times = []
+            if do_polling:
+                next_times.append(last_polling_time + (polling_interval / 1000))
+            if do_validation:
+                next_times.append(last_validation_time + (validation_interval / 1000))
+
+            next_wake_time = min(next_times)
             current_time = time.time()
-            time_to_sleep = max(0, next_poll_time - current_time)
+            time_to_sleep = max(0, next_wake_time - current_time)
             await asyncio.sleep(time_to_sleep)
-            last_polling_time = next_poll_time
-
 
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
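Taken together, the monitor.py changes make RSS polling and EFTS validation independently optional (pass None to disable one; disabling both raises ValueError), compute dates in EDGAR's timezone via ZoneInfo("America/New_York") instead of host-local time, and sleep until whichever enabled task is due next rather than waking on a fixed poll cadence. A minimal usage sketch, assuming Monitor can be constructed without arguments (its constructor is not shown in this diff) and that the callback shape matches what data_callback receives:

    from datamule.sec.submissions.monitor import Monitor

    def on_new_submissions(hits):
        # Receives the list the monitor passes to data_callback.
        for hit in hits:
            print(hit)

    monitor = Monitor()
    # Poll RSS every second and cross-check via EFTS every minute.
    # polling_interval=None would run EFTS validation only;
    # validation_interval=None would run RSS polling only.
    monitor.monitor_submissions(
        data_callback=on_new_submissions,
        polling_interval=1000,       # milliseconds
        validation_interval=60000,   # milliseconds
        quiet=False,
    )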
--- datamule-1.5.3/datamule/seclibrary/downloader.py
+++ datamule-1.5.5/datamule/seclibrary/downloader.py
@@ -98,6 +98,7 @@ class Downloader:
                 filename, content = item
                 output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
                 write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
+
                 self.pbar.update(1)
 
     def _processing_worker(self):
@@ -296,6 +297,64 @@ class Downloader:
             self.loop.call_soon_threadsafe(self.loop.stop)
 
 
+
+    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+        """
+        Download and process SEC filings using specific filenames.
+
+        Parameters:
+        - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+        - output_dir: Directory to save downloaded files
+        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+        - keep_filtered_metadata: Whether to keep metadata for filtered documents
+        - standardize_metadata: Whether to standardize metadata format
+        """
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        if not filenames:
+            raise ValueError("No filenames provided")
+
+        if not isinstance(filenames, (list, tuple)):
+            filenames = [filenames]
+
+        # Validate filenames format
+        for filename in filenames:
+            if not isinstance(filename, str):
+                raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
+            if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
+                raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
+
+        # Generate URLs directly from filenames
+        print(f"Generating URLs for {len(filenames)} files...")
+        urls = []
+        for filename in filenames:
+            url = f"{self.BASE_URL}{filename}"
+            urls.append(url)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        urls = [url for url in urls if not (url in seen or seen.add(url))]
+
+        print(f"Downloading {len(urls)} files...")
+
+        # Process the batch asynchronously using existing infrastructure
+        start_time = time.time()
+
+        asyncio.run(self.process_batch(
+            urls,
+            output_dir,
+            keep_document_types=keep_document_types,
+            keep_filtered_metadata=keep_filtered_metadata,
+            standardize_metadata=standardize_metadata
+        ))
+
+        # Calculate and display performance metrics
+        elapsed_time = time.time() - start_time
+        print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+        print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+
 def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
     """
     Query SEC filings and download/process them.
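A note on the de-duplication idiom in the new method: set.add() returns None, so the expression "url in seen or seen.add(url)" is falsy exactly once per unique URL, which yields an order-preserving unique list in a single pass. A standalone illustration:

    # Order-preserving de-duplication, as used in download_files_using_filename.
    urls = ['a.sgml', 'b.sgml', 'a.sgml']
    seen = set()
    unique = [u for u in urls if not (u in seen or seen.add(u))]
    print(unique)  # ['a.sgml', 'b.sgml']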
@@ -325,4 +384,27 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         keep_document_types=keep_document_types,
         keep_filtered_metadata=keep_filtered_metadata,
         standardize_metadata=standardize_metadata
+    )
+
+def download_files_using_filename(filenames, api_key=None, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+    """
+    Download and process SEC filings using specific filenames.
+
+    Parameters:
+    - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+    - output_dir: Directory to save downloaded files
+    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+    - keep_filtered_metadata: Whether to keep metadata for filtered documents
+    - standardize_metadata: Whether to standardize metadata format
+    """
+    downloader = Downloader(api_key=api_key)
+    downloader.QUEUE_SIZE = 1
+    downloader.MAX_CONCURRENT_DOWNLOADS = 1
+    downloader.download_files_using_filename(
+        filenames=filenames,
+        output_dir=output_dir,
+        keep_document_types=keep_document_types,
+        keep_filtered_metadata=keep_filtered_metadata,
+        standardize_metadata=standardize_metadata
     )
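The module-level wrapper mirrors the existing download() helper but takes exact archive filenames instead of query parameters, and it pins QUEUE_SIZE and MAX_CONCURRENT_DOWNLOADS to 1, which appears to serialize fetching. A hedged sketch of calling it (the filenames are the examples from the docstring; assumes DATAMULE_API_KEY is set, otherwise pass api_key explicitly):

    from datamule.seclibrary.downloader import download_files_using_filename

    download_files_using_filename(
        ['000091205797006494.sgml', '000100704297000007.sgml.zst'],
        output_dir='downloads',
        keep_document_types=['10-K'],  # illustrative filter
    )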
--- datamule-1.5.3/datamule/submission.py
+++ datamule-1.5.5/datamule/submission.py
@@ -3,6 +3,7 @@ import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
 from secsgml.utils import bytes_to_str
+from secsgml.parse_sgml import transform_metadata_string
 import tarfile
 import shutil
 import zstandard as zstd
@@ -86,6 +87,10 @@ class Submission:
         if sgml_content is not None:
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
+
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
             # code dupe
             self.accession = self.metadata.content['accession-number']
@@ -123,6 +128,9 @@ class Submission:
             metadata_path = self.path / 'metadata.json'
             with metadata_path.open('r') as f:
                 metadata = json.load(f)
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
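Both Submission construction paths (parsing raw SGML in memory, and loading metadata.json from disk) now run the parsed metadata through secsgml's transform_metadata_string before wrapping it in a Document, so fields like 'accession-number' and 'filing-date' arrive in a consistent shape either way. A sketch of the in-memory path, assuming the sgml_content name visible in the branch above is a constructor keyword:

    from datamule.submission import Submission

    # Raw SGML bytes for a single filing, fetched previously.
    with open('000091205797006494.sgml', 'rb') as f:
        sgml_bytes = f.read()

    sub = Submission(sgml_content=sgml_bytes)
    # Metadata is standardized before these fields are read:
    print(sub.accession, sub.filing_date)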
--- datamule-1.5.3/datamule.egg-info/PKG-INFO
+++ datamule-1.5.5/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.3
+Version: 1.5.5
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
--- datamule-1.5.3/setup.py
+++ datamule-1.5.5/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.5.3",
+    version="1.5.5",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",