datamule 1.0.9__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {datamule-1.0.9 → datamule-1.1.1}/PKG-INFO +1 -1
  2. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/infrastructure/submissions_metadata.py +29 -8
  3. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/submissions/eftsquery.py +37 -22
  4. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/submissions/textsearch.py +10 -6
  5. {datamule-1.0.9 → datamule-1.1.1}/datamule.egg-info/PKG-INFO +1 -1
  6. {datamule-1.0.9 → datamule-1.1.1}/setup.py +1 -1
  7. {datamule-1.0.9 → datamule-1.1.1}/datamule/__init__.py +0 -0
  8. {datamule-1.0.9 → datamule-1.1.1}/datamule/book/__init__.py +0 -0
  9. {datamule-1.0.9 → datamule-1.1.1}/datamule/book/book.py +0 -0
  10. {datamule-1.0.9 → datamule-1.1.1}/datamule/config.py +0 -0
  11. {datamule-1.0.9 → datamule-1.1.1}/datamule/document.py +0 -0
  12. {datamule-1.0.9 → datamule-1.1.1}/datamule/helper.py +0 -0
  13. {datamule-1.0.9 → datamule-1.1.1}/datamule/mapping_dicts/__init__.py +0 -0
  14. {datamule-1.0.9 → datamule-1.1.1}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  15. {datamule-1.0.9 → datamule-1.1.1}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  16. {datamule-1.0.9 → datamule-1.1.1}/datamule/portfolio.py +0 -0
  17. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/__init__.py +0 -0
  18. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/infrastructure/__init__.py +0 -0
  19. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/rss/__init__.py +0 -0
  20. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/rss/monitor.py +0 -0
  21. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/submissions/__init__.py +0 -0
  22. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/submissions/downloader.py +0 -0
  23. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/submissions/monitor.py +0 -0
  24. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/submissions/streamer.py +0 -0
  25. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/utils.py +0 -0
  26. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/xbrl/__init__.py +0 -0
  27. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  28. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  29. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  30. {datamule-1.0.9 → datamule-1.1.1}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  31. {datamule-1.0.9 → datamule-1.1.1}/datamule/seclibrary/__init__.py +0 -0
  32. {datamule-1.0.9 → datamule-1.1.1}/datamule/seclibrary/downloader.py +0 -0
  33. {datamule-1.0.9 → datamule-1.1.1}/datamule/seclibrary/query.py +0 -0
  34. {datamule-1.0.9 → datamule-1.1.1}/datamule/submission.py +0 -0
  35. {datamule-1.0.9 → datamule-1.1.1}/datamule.egg-info/SOURCES.txt +0 -0
  36. {datamule-1.0.9 → datamule-1.1.1}/datamule.egg-info/dependency_links.txt +0 -0
  37. {datamule-1.0.9 → datamule-1.1.1}/datamule.egg-info/requires.txt +0 -0
  38. {datamule-1.0.9 → datamule-1.1.1}/datamule.egg-info/top_level.txt +0 -0
  39. {datamule-1.0.9 → datamule-1.1.1}/setup.cfg +0 -0
{datamule-1.0.9 → datamule-1.1.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.0.9
+Version: 1.1.1
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-1.0.9 → datamule-1.1.1}/datamule/sec/infrastructure/submissions_metadata.py

@@ -12,7 +12,6 @@ from ..utils import headers
 
 async def download_sec_file(url, target_path):
     """Download submissions.zip from SEC website with progress bar."""
-
 
     async with aiohttp.ClientSession() as session:
         async with session.get(url, headers=headers) as response:
@@ -53,6 +52,9 @@ def extract_metadata(data):
             for field in ['street1', 'street2', 'city', 'stateOrCountry', 'zipCode', 'stateOrCountryDescription']:
                 result[f"{addr_type}_{field}"] = addr.get(field)
 
+    # Add start_date field (will be populated later)
+    result['start_date'] = ''
+
     return result
 
 def extract_earliest_filing_date(data):
@@ -78,8 +80,12 @@ def extract_earliest_filing_date(data):
     return earliest_date
 
 def process_former_names(data, cik, current_name):
-    """Process former names into a list of records."""
+    """
+    Process former names into a list of records.
+    Returns former names records and the earliest company date.
+    """
     former_names_records = []
+    earliest_company_date = None
 
     # Process former names if present
     former_names = data.get('formerNames', [])
@@ -98,6 +104,10 @@ def process_former_names(data, cik, current_name):
         # Clean up date formats (remove time component)
         if start_date:
             start_date = start_date.split('T')[0]
+            # Track earliest company date across all former names
+            if earliest_company_date is None or start_date < earliest_company_date:
+                earliest_company_date = start_date
+
         if end_date:
             end_date = end_date.split('T')[0]
             # Track latest end date
@@ -114,10 +124,16 @@ def process_former_names(data, cik, current_name):
 
         former_names_records.append(record)
 
+    # Find the earliest filing date for the company if no date found in former names
+    if earliest_company_date is None:
+        earliest_company_date = extract_earliest_filing_date(data)
+        if earliest_company_date and 'T' in earliest_company_date:
+            earliest_company_date = earliest_company_date.split('T')[0]
+
     # For the current name, if we don't have a start date from former names,
-    # we'll try to find the earliest filing date
+    # we'll use the earliest filing date
     if not latest_end_date:
-        latest_end_date = extract_earliest_filing_date(data)
+        latest_end_date = earliest_company_date
 
     # Add current name record with start date as latest end date
     current_record = {
@@ -129,7 +145,8 @@ def process_former_names(data, cik, current_name):
 
     former_names_records.append(current_record)
 
-    return former_names_records
+    # Return both the records and the earliest company date (for metadata)
+    return former_names_records, earliest_company_date
 
 def write_metadata_to_csv(metadata_list, output_path):
     """Write metadata records to CSV and compress with gzip."""
@@ -145,8 +162,8 @@ def write_metadata_to_csv(metadata_list, output_path):
     for metadata in metadata_list:
         fieldnames.update(metadata.keys())
 
-    # Make sure 'name' and 'cik' come first
-    fieldnames = ['name', 'cik'] + [f for f in sorted(fieldnames) if f not in ['name', 'cik']]
+    # Make sure 'name', 'cik', and 'start_date' come first
+    fieldnames = ['name', 'cik', 'start_date'] + [f for f in sorted(fieldnames) if f not in ['name', 'cik', 'start_date']]
 
     # Write directly to gzipped CSV without using StringIO buffer
     with gzip.open(output_path, 'wt', encoding='utf-8', newline='') as gzfile:
@@ -299,7 +316,11 @@ async def extract_and_process_metadata(output_dir, local_zip_path=None, sec_url=
             name = metadata.get('name', '')
 
             # Process former names with the full json_data
-            former_names_records = process_former_names(json_data, cik, name)
+            # Now also returning the earliest company date
+            former_names_records, earliest_company_date = process_former_names(json_data, cik, name)
+
+            # Add the earliest company date to the metadata
+            metadata['start_date'] = earliest_company_date if earliest_company_date else ''
 
             # Check if company is listed (has tickers)
             tickers = metadata.get('tickers', [])
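
Taken together, the submissions_metadata.py changes mean process_former_names now returns a two-value tuple and every metadata row gains a start_date column written right after name and cik. A minimal sketch of the new calling convention, assuming the function is importable from datamule.sec.infrastructure.submissions_metadata (per the file listing); the JSON path and payload fields are illustrative, not taken from the package:

import json

from datamule.sec.infrastructure.submissions_metadata import process_former_names

# json_data is one company's submissions payload, as extracted from submissions.zip
# (the filename here is purely illustrative).
with open("CIK0000320193.json") as f:
    json_data = json.load(f)

cik = json_data.get("cik", "")
name = json_data.get("name", "")

# New in 1.1.1: process_former_names returns (records, earliest_company_date)
# instead of just the list of former-name records.
former_names_records, earliest_company_date = process_former_names(json_data, cik, name)

# The earliest date is copied onto the metadata row, so the gzipped CSV now carries
# a start_date column ordered right after name and cik.
metadata = {"name": name, "cik": cik}
metadata["start_date"] = earliest_company_date if earliest_company_date else ""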

{datamule-1.0.9 → datamule-1.1.1}/datamule/sec/submissions/eftsquery.py

@@ -6,13 +6,14 @@ from tqdm import tqdm
 from ..utils import RetryException, PreciseRateLimiter, RateMonitor, headers
 
 class EFTSQuery:
-    def __init__(self, requests_per_second=5.0):
+    def __init__(self, requests_per_second=5.0, quiet=False):
         self.base_url = "https://efts.sec.gov/LATEST/search-index"
         self.headers = headers
         self.limiter = PreciseRateLimiter(requests_per_second)
         self.rate_monitor = RateMonitor()
         self.session = None
         self.pbar = None
+        self.quiet = quiet
         self.max_page_size = 100  # EFTS API limit
         self.fetch_queue = asyncio.Queue()
         self.connection_semaphore = asyncio.Semaphore(5)  # Max 5 concurrent connections
@@ -127,6 +128,8 @@ class EFTSQuery:
         return ", ".join(parts)
 
     async def _fetch_json(self, url):
+        if not self.quiet:
+            print(f"Fetching {url}...")
         async with self.connection_semaphore:
             async with self.limiter:
                 try:
@@ -160,18 +163,21 @@ class EFTSQuery:
                        await callback(hits)
                    self.fetch_queue.task_done()
                except RetryException as e:
-                    print(f"\nRate limited. Sleeping for {e.retry_after} seconds...")
+                    if not self.quiet:
+                        print(f"\nRate limited. Sleeping for {e.retry_after} seconds...")
                    await asyncio.sleep(e.retry_after)
                    # Put back in queue
                    await self.fetch_queue.put((params, from_val, size_val, callback))
                    self.fetch_queue.task_done()
                except Exception as e:
-                    print(f"\nError fetching {url}: {str(e)}")
+                    if not self.quiet:
+                        print(f"\nError fetching {url}: {str(e)}")
                    self.fetch_queue.task_done()
            except asyncio.CancelledError:
                break
            except Exception as e:
-                print(f"\nWorker error: {str(e)}")
+                if not self.quiet:
+                    print(f"\nWorker error: {str(e)}")
                self.fetch_queue.task_done()
 
    def _split_date_range(self, start_date, end_date, num_splits=4):
@@ -322,12 +328,14 @@ class EFTSQuery:
 
        # Skip if no results
        if total_hits == 0:
-            print(f"Skipping negated forms query - no results returned")
+            if not self.quiet:
+                print(f"Skipping negated forms query - no results returned")
            return
 
-        query_desc = self._get_query_description(params)
-        date_range = f"{start_date} to {end_date}"
-        print(f"Planning: Analyzing negated forms query (depth {depth}): {date_range} [{total_hits:,} hits]")
+        if not self.quiet:
+            query_desc = self._get_query_description(params)
+            date_range = f"{start_date} to {end_date}"
+            print(f"Planning: Analyzing negated forms query (depth {depth}): {date_range} [{total_hits:,} hits]")
 
        # If small enough or at max depth, process directly
        if total_hits < self.max_efts_hits or start_date == end_date:
@@ -350,8 +358,9 @@
 
 
        total_hits, data = await self._test_query_size(params)
 
-        query_desc = self._get_query_description(params)
-        print(f"Planning: Analyzing {' '*depth}query: {query_desc} [{total_hits:,} hits]")
+        if not self.quiet:
+            query_desc = self._get_query_description(params)
+            print(f"Planning: Analyzing {' '*depth}query: {query_desc} [{total_hits:,} hits]")
 
        # If we're at the maximum recursion depth or hits are under limit, process directly
        if depth >= max_depth or total_hits < self.max_efts_hits:
@@ -396,8 +405,9 @@
 
    async def _start_query_phase(self, callback):
        """Start the query phase after planning is complete"""
-        print("\n--- Starting query phase ---")
-        self.pbar = tqdm(total=self.total_results_to_fetch, desc="Querying documents [Rate: 0/s | 0 MB/s]")
+        if not self.quiet:
+            print("\n--- Starting query phase ---")
+            self.pbar = tqdm(total=self.total_results_to_fetch, desc="Querying documents [Rate: 0/s | 0 MB/s]")
 
        # Queue all pending page requests
        for params, from_val, size_val, callback in self.pending_page_requests:
@@ -425,18 +435,21 @@
        self.pbar = None
 
        # First check size
-        print("\n--- Starting query planning phase ---")
-        print("Analyzing request and splitting into manageable chunks...")
+        if not self.quiet:
+            print("\n--- Starting query planning phase ---")
+            print("Analyzing request and splitting into manageable chunks...")
 
        total_hits, data = await self._test_query_size(params)
 
        if total_hits == 0:
-            print("No results found for this query.")
+            if not self.quiet:
+                print("No results found for this query.")
            return []
 
        # Get accurate total from aggregation buckets
        self.true_total_docs = self._get_total_from_buckets(data)
-        print(f"Found {self.true_total_docs:,} total documents to retrieve.")
+        if not self.quiet:
+            print(f"Found {self.true_total_docs:,} total documents to retrieve.")
 
        # Start worker tasks
        workers = [asyncio.create_task(self._fetch_worker()) for _ in range(5)]
@@ -458,7 +471,8 @@
                negated_forms.append('-0')  # Keep primary documents constraint
 
                remaining_docs = self.true_total_docs - self.processed_doc_count
-                print(f"Planning: Analyzing remaining primary document forms using negation (~{remaining_docs:,} hits)")
+                if not self.quiet:
+                    print(f"Planning: Analyzing remaining primary document forms using negation (~{remaining_docs:,} hits)")
 
                # Process negated forms query with recursive date splitting
                start_date = params['startdt']
@@ -466,9 +480,9 @@
                await self._process_negated_forms_recursive(
                    params, negated_forms, start_date, end_date, 0, collect_hits
                )
-            else:
+            elif not self.quiet:
                print("No additional forms to process with negation - not a primary documents query")
-        else:
+        elif not self.quiet:
            print("No additional forms to process with negation")
 
        # Start the download phase
@@ -488,15 +502,16 @@
            self.pbar.close()
            self.pbar = None
 
-        print(f"\n--- Query complete: {len(all_hits):,} submissions retrieved ---")
+        if not self.quiet:
+            print(f"\n--- Query complete: {len(all_hits):,} submissions retrieved ---")
        return all_hits
 
-def query_efts(cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, callback=None):
+def query_efts(cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, callback=None, quiet=False):
    """
    Convenience function to run a query without managing the async context.
    """
    async def run_query():
-        query = EFTSQuery(requests_per_second=requests_per_second)
+        query = EFTSQuery(requests_per_second=requests_per_second, quiet=quiet)
        return await query.query(cik, submission_type, filing_date, callback)
 
    return asyncio.run(run_query())
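
The net effect in eftsquery.py is a quiet flag threaded from the query_efts convenience wrapper down to the worker loop, silencing planning prints, per-URL fetch messages, and the tqdm progress bar while still returning the collected hits. A short usage sketch, assuming the module is importable as datamule.sec.submissions.eftsquery per the file listing; the CIK and form type below are illustrative:

from datamule.sec.submissions.eftsquery import query_efts

# quiet=True (new in 1.1.1) suppresses all console output from the query;
# the list of hits is still returned.
hits = query_efts(
    cik="320193",              # illustrative CIK
    submission_type="10-K",    # illustrative form type
    requests_per_second=5.0,
    quiet=True,
)
print(f"{len(hits)} submissions retrieved")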

{datamule-1.0.9 → datamule-1.1.1}/datamule/sec/submissions/textsearch.py

@@ -9,8 +9,8 @@ class TextSearchEFTSQuery(EFTSQuery):
    """
    Extended EFTSQuery class that adds text search capabilities.
    """
-    def __init__(self, text_query, requests_per_second=5.0):
-        super().__init__(requests_per_second=requests_per_second)
+    def __init__(self, text_query, requests_per_second=5.0, quiet=False):
+        super().__init__(requests_per_second=requests_per_second, quiet=quiet)
        self.text_query = text_query
 
    def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
@@ -46,7 +46,7 @@ async def extract_accession_numbers(hits):
            accession_numbers.append(acc_no)
    return accession_numbers
 
-def query(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0):
+def query(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, quiet=False):
    """
    Search SEC filings for text and return the full search results.
 
@@ -66,6 +66,8 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
    requests_per_second : float, optional
        Maximum number of requests per second to make to the SEC API.
        Default is 5.0.
+    quiet : bool, optional
+        If True, suppresses all output (progress bars and prints). Default is False.
 
    Returns:
    --------
@@ -73,12 +75,12 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
        Complete search results with all hit data.
    """
    async def run_query():
-        query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second)
+        query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
        return await query.query(cik, submission_type, filing_date)
 
    return asyncio.run(run_query())
 
-def filter_text(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0):
+def filter_text(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, quiet=False):
    """
    Search SEC filings for text and return matching accession numbers.
 
@@ -98,6 +100,8 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
    requests_per_second : float, optional
        Maximum number of requests per second to make to the SEC API.
        Default is 5.0.
+    quiet : bool, optional
+        If True, suppresses all output (progress bars and prints). Default is False.
 
    Returns:
    --------
@@ -105,7 +109,7 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
        List of accession numbers (as strings) for filings that match the text query.
    """
    async def run_query():
-        query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second)
+        query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
 
        # Create a collector for accession numbers
        all_acc_nos = []
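
Both module-level helpers in textsearch.py accept the same pass-through flag and hand it to TextSearchEFTSQuery. A minimal sketch, assuming the module path datamule.sec.submissions.textsearch from the file listing; the quoted phrase and form type are illustrative:

from datamule.sec.submissions.textsearch import query, filter_text

# Full hit objects for filings matching the text query, with console output suppressed.
results = query('"climate risk"', submission_type="10-K", quiet=True)

# Only the matching accession numbers, same pass-through flag.
accession_numbers = filter_text('"climate risk"', submission_type="10-K", quiet=True)

print(len(results), len(accession_numbers))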

{datamule-1.0.9 → datamule-1.1.1}/datamule.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.0.9
+Version: 1.1.1
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-1.0.9 → datamule-1.1.1}/setup.py

@@ -29,7 +29,7 @@ if not file_path.exists():
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.0.9",
+    version="1.1.1",
     description="Making it easier to use SEC filings.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",