datamule 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/document.py CHANGED
@@ -8,31 +8,34 @@ from .mapping_dicts.xml_mapping_dicts import dict_345
 from selectolax.parser import HTMLParser
 
 class Document:
-    def __init__(self, type, filename):
+    def __init__(self, type, content, extension):
+
         self.type = type
-        self.path = filename
+        # we will remove this later #
+        # make sure extension is in lower case
+        extension = extension.lower()
+        self.content = content
+        if extension == '.txt':
+            self.content = self._preprocess_txt_content()
+        elif extension in ['.htm', '.html']:
+            self.content = self._preprocess_html_content()
 
+        self.extension = extension
+        # this will be filled by parsed
         self.data = None
-        self.content = None
-
 
-    def load_content(self,encoding='utf-8'):
-        with open(self.path, 'r',encoding=encoding) as f:
-            self.content = f.read()
-
-    def _load_text_content(self):
-        with open(self.path) as f:
-            return f.read().translate(str.maketrans({
+    #_load_text_content
+    def _preprocess_txt_content(self):
+        return self.content.read().translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
         }))
 
     # will deprecate this when we add html2dict
-    def _load_html_content(self):
-        with open(self.path,'rb') as f:
-            parser = HTMLParser(f.read(),detect_encoding=True,decode_errors='ignore')
-
+    def _preprocess_html_content(self):
+        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
+
         # Remove hidden elements first
         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
         for node in hidden_nodes:
@@ -83,20 +86,9 @@ class Document:
             '\u201c': '"', '\u201d': '"'
         }))
 
-    def _load_file_content(self):
-        if self.path.suffix =='.txt':
-            self.content = self._load_text_content()
-        elif self.path.suffix in ['.html','.htm']:
-            self.content = self._load_html_content()
-        else:
-            raise ValueError(f"Unsupported file type: {self.path.suffix}")
-
-
     def contains_string(self, pattern):
-        """Currently only works for .htm, .html, and .txt files"""
-        if self.path.suffix in ['.htm', '.html', '.txt']:
-            if self.content is None:
-                self.content = self._load_file_content(self.path)
+        """Works for select files"""
+        if self.extension in ['.htm', '.html', '.txt','.xml']:
             return bool(re.search(pattern, self.content))
         return False
 
@@ -104,15 +96,14 @@ class Document:
     def parse(self):
         mapping_dict = None
 
-        if self.path.suffix == '.xml':
+        if self.extension == '.xml':
             if self.type in ['3', '4', '5']:
                 mapping_dict = dict_345
 
-            self.load_content()
             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+
         # will deprecate this when we add html2dict
-        elif self.path.suffix in ['.htm', '.html','.txt']:
-            self._load_file_content()
+        elif self.extension in ['.htm', '.html','.txt']:
 
             if self.type == '10-K':
                 mapping_dict = dict_10k
@@ -133,18 +124,12 @@ class Document:
         if not self.data:
             self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
-
         with open(output_filename, 'w',encoding='utf-8') as f:
             json.dump(self.data, f, indent=2)
 
     def write_csv(self, output_filename=None, accession_number=None):
         self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
-
         with open(output_filename, 'w', newline='') as csvfile:
             if not self.data:
                 return output_filename
@@ -165,7 +150,7 @@ class Document:
             writer.writeheader()
             for row in self.data:
                 if accession_number:
-                    row['Accession Number'] = convert_to_dashed_accession(accession_number)
+                    row['Accession Number'] = accession_number
                 writer.writerow(row)
 
         return output_filename
@@ -225,7 +210,7 @@ class Document:
         # Let's remove XML iterable for now
 
         # Handle text-based documents
-        if self.path.suffix in ['.txt', '.htm', '.html']:
+        if self.extension in ['.txt', '.htm', '.html']:
             document_data = self.data
             if not document_data:
                 return iter([])
@@ -235,13 +220,13 @@ class Document:
             section_type = None
 
             if self.type in ['10-K', '10-Q']:
-                mapping_dict = txt_mapping_dicts.dict_10k if self.type == '10-K' else txt_mapping_dicts.dict_10q
+                mapping_dict = dict_10k if self.type == '10-K' else dict_10q
             elif self.type == '8-K':
-                mapping_dict = txt_mapping_dicts.dict_8k
+                mapping_dict = dict_8k
             elif self.type == 'SC 13D':
-                mapping_dict = txt_mapping_dicts.dict_13d
+                mapping_dict = dict_13d
             elif self.type == 'SC 13G':
-                mapping_dict = txt_mapping_dicts.dict_13g
+                mapping_dict = dict_13g
             else:
                 return iter([])
 
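The document.py changes above replace path-based loading (`load_content`, `_load_file_content`) with in-memory preprocessing keyed on a lowercased extension. A minimal, hedged sketch of the new constructor in use — the HTML snippet and form type are illustrative, not taken from the package:

    from datamule.document import Document

    # Hypothetical HTML standing in for a downloaded filing document
    html = "<html><body><p>Item 1. Business</p></body></html>"

    # New-style construction: in-memory content plus extension, no file path
    doc = Document(type='10-K', content=html, extension='.HTML')  # extension is lowercased internally

    # contains_string() now checks self.extension instead of self.path.suffix
    print(doc.contains_string(r"Item\s+1"))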
datamule/portfolio.py CHANGED
@@ -142,7 +142,7 @@ class Portfolio:
             cik=cik,
             submission_type=submission_type,
             filing_date=filing_date,
-            requests_per_second=5, # Revisit this later.
+            requests_per_second=5,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
         )
 
@@ -179,4 +179,8 @@ class Portfolio:
             document_types = [document_types]
 
         for submission in self.submissions:
-            yield from submission.document_type(document_types)
+            yield from submission.document_type(document_types)
+
+    def keep(self,document_type):
+        for submission in self.__iter__():
+            submission.keep(document_type)
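The new `Portfolio.keep()` simply fans out to `Submission.keep()` (defined in submission.py further down). A hedged sketch of how it could be used to prune a downloaded portfolio to one form type — the directory name is a placeholder and the constructor signature is assumed from context:

    from datamule.portfolio import Portfolio

    # Hypothetical folder of previously downloaded submissions
    portfolio = Portfolio('filings')

    # Deletes every on-disk document whose type is not 10-K in each submission;
    # only works for path-backed submissions (see Submission.keep below)
    portfolio.keep('10-K')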
datamule/sec/submissions/downloader.py CHANGED
@@ -36,7 +36,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         print(f"Error processing {accno}: {e}")
         return None
 
-def download(cik=None, submission_type=None, filing_date=None, requests_per_second=5, output_dir="filings", accession_numbers=None):
+def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None, quiet=False):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -44,12 +45,25 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_seco
     - cik: CIK number(s) to query for
     - submission_type: Filing type(s) to query for (default: 10-K)
     - filing_date: Date or date range to query for
+    - location: Location code to filter by (e.g., 'CA' for California)
+    - name: Company name to search for (alternative to providing CIK)
     - requests_per_second: Rate limit for SEC requests
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
+    - quiet: Whether to suppress progress output
 
     Returns:
     - List of all document paths processed
+
+    Examples:
+    # Download filings by CIK
+    download(cik="1318605", submission_type="10-K")
+
+    # Download filings by company name
+    download(name="Tesla", submission_type="10-K")
+
+    # Download filings with location filter
+    download(name="Apple", location="CA", submission_type="10-K")
     """
 
     # Make sure output directory exists
@@ -62,9 +76,12 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_seco
     # Call the stream function with our callback
     return stream(
         cik=cik,
+        name=name,
         submission_type=submission_type,
         filing_date=filing_date,
+        location=location,
         requests_per_second=requests_per_second,
         document_callback=callback_wrapper,
-        accession_numbers=accession_numbers
+        accession_numbers=accession_numbers,
+        quiet=quiet
     )
datamule/sec/submissions/eftsquery.py CHANGED
@@ -42,6 +42,67 @@ class EFTSQuery:
         await self.session.close()
         self.session = None
 
+    async def search_name(self, name):
+        """
+        Search for companies by name using the EFTS name search endpoint.
+
+        Parameters:
+        name (str): Company name to search for
+
+        Returns:
+        list: List of dictionaries containing company information (entity, id, tickers if available)
+        """
+        if not self.session:
+            raise RuntimeError("No active session. This method must be called within an async context.")
+
+        url = f"{self.base_url}?keysTyped={name}"
+
+        if not self.quiet:
+            print(f"Searching for company: {name}")
+
+        async with self.limiter:
+            try:
+                async with self.session.get(url) as response:
+                    if response.status == 429:
+                        raise RetryException(url)
+                    response.raise_for_status()
+                    content = await response.read()
+                    await self.rate_monitor.add_request(len(content))
+                    data = await response.json()
+
+                    if 'hits' in data and 'hits' in data['hits']:
+                        hits = data['hits']['hits']
+                        results = []
+
+                        for hit in hits:
+                            source = hit.get('_source', {})
+                            result = {
+                                'entity': source.get('entity', ''),
+                                'id': hit.get('_id', ''),
+                                'tickers': source.get('tickers', '')
+                            }
+                            results.append(result)
+
+                        if not self.quiet and results:
+                            # Create a compact display of results
+                            display_results = [f"{r['entity']} [{r['id']}]" for r in results]
+                            print(f"Name matches: {', '.join(display_results[:5])}")
+                            if len(results) > 5:
+                                print(f"...and {len(results) - 5} more matches")
+
+                        return results
+                    return []
+            except aiohttp.ClientResponseError as e:
+                if e.status == 429:
+                    raise RetryException(url)
+                if not self.quiet:
+                    print(f"Error searching for company: {str(e)}")
+                return []
+            except Exception as e:
+                if not self.quiet:
+                    print(f"Error searching for company: {str(e)}")
+                return []
+
     def _get_form_exclusions(self, form):
         """Dynamically generate form exclusions based on patterns"""
         # Skip already negated forms
@@ -55,7 +116,7 @@ class EFTSQuery:
         # No exclusions for amendment forms
         return []
 
-    def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
+    def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
         params = {}
 
         # Handle CIK
@@ -111,6 +172,10 @@ class EFTSQuery:
             params['startdt'] = "2001-01-01"
             params['enddt'] = datetime.now().strftime('%Y-%m-%d')
 
+        # Handle location filtering
+        if location:
+            params['filter_location'] = location
+
         return params
 
     def _get_query_description(self, params):
@@ -125,6 +190,9 @@ class EFTSQuery:
         if 'startdt' in params and 'enddt' in params:
             parts.append(f"dates={params['startdt']} to {params['enddt']}")
 
+        if 'filter_location' in params:
+            parts.append(f"location={params['filter_location']}")
+
         return ", ".join(parts)
 
     async def _fetch_json(self, url):
@@ -413,12 +481,26 @@ class EFTSQuery:
         for params, from_val, size_val, callback in self.pending_page_requests:
             await self.fetch_queue.put((params, from_val, size_val, callback))
 
-    async def query(self, cik=None, submission_type=None, filing_date=None, callback=None):
-        params = self._prepare_params(cik, submission_type, filing_date)
-        all_hits = []
+    async def query(self, cik=None, submission_type=None, filing_date=None, location=None, callback=None, name=None):
+        """
+        Query SEC filings using the EFTS API.
+
+        Parameters:
+        cik (str or list): Central Index Key(s) for the company
+        submission_type (str or list): Filing form type(s) to filter by
+        filing_date (str, tuple, or list): Date or date range to filter by
+        location (str): Location code to filter by (e.g., 'CA' for California)
+        callback (function): Async callback function to process results as they arrive
+        name (str): Company name to search for (alternative to providing CIK)
 
-        # Check if this is a primary documents query
-        self.was_primary_docs_query = '-0' in params.get('forms', '').split(',')
+        Returns:
+        list: List of filing documents matching the query criteria
+        """
+        # If both CIK and name are provided, raise an error
+        if cik is not None and name is not None:
+            raise ValueError("Please provide either 'name' or 'cik', not both")
+
+        all_hits = []
 
         # Collector callback to gather all hits
         async def collect_hits(hits):
@@ -427,6 +509,25 @@ class EFTSQuery:
                 await callback(hits)
 
         async with self as client:
+            # If name is provided, search for matching companies inside the context manager
+            if name is not None:
+                company_results = await self.search_name(name)
+                if not company_results:
+                    if not self.quiet:
+                        print(f"No companies found matching: {name}")
+                    return []
+
+                # Use the first (best) match's CIK
+                cik = company_results[0]['id']
+                if not self.quiet:
+                    print(f"Using CIK {cik} for {company_results[0]['entity']}")
+
+            # Now prepare parameters with the CIK (either provided directly or from name search)
+            params = self._prepare_params(cik, submission_type, filing_date, location)
+
+            # Check if this is a primary documents query
+            self.was_primary_docs_query = '-0' in params.get('forms', '').split(',')
+
             # Reset state for new query
             self.total_results_to_fetch = 0
             self.pending_page_requests = []
@@ -506,12 +607,32 @@ class EFTSQuery:
             print(f"\n--- Query complete: {len(all_hits):,} submissions retrieved ---")
         return all_hits
 
-def query_efts(cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, callback=None, quiet=False):
+def query_efts(cik=None, submission_type=None, filing_date=None, location=None, requests_per_second=5.0, callback=None, quiet=False, name=None):
     """
     Convenience function to run a query without managing the async context.
+
+    Parameters:
+    cik (str or list): Central Index Key(s) for the company
+    submission_type (str or list): Filing form type(s) to filter by
+    filing_date (str, tuple, or list): Date or date range to filter by
+    location (str): Location code to filter by (e.g., 'CA' for California)
+    requests_per_second (float): Maximum requests per second to make to the SEC API
+    callback (function): Async callback function to process results as they arrive
+    quiet (bool): Whether to suppress progress output
+    name (str): Company name to search for (alternative to providing CIK)
+
+    Returns:
+    list: List of filing documents matching the query criteria
+
+    Example:
+    To search by company name:
+    results = query_efts(name="Tesla", submission_type="10-K")
+
+    To search by CIK:
+    results = query_efts(cik="1318605", submission_type="10-K")
     """
     async def run_query():
         query = EFTSQuery(requests_per_second=requests_per_second, quiet=quiet)
-        return await query.query(cik, submission_type, filing_date, callback)
+        return await query.query(cik, submission_type, filing_date, location, callback, name)
 
     return asyncio.run(run_query())
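`search_name()` requires an open aiohttp session, so it has to run inside the query object's async context (the same `async with self` that `query()` uses). A hedged sketch — the import path is assumed from the package layout shown in the RECORD section:

    import asyncio
    from datamule.sec.submissions.eftsquery import EFTSQuery  # assumed module path

    async def main():
        query = EFTSQuery(requests_per_second=5.0, quiet=False)
        # Outside an active session search_name() raises RuntimeError,
        # so enter the object as an async context manager first
        async with query:
            matches = await query.search_name("Tesla")
        for match in matches:
            print(match['entity'], match['id'], match['tickers'])

    asyncio.run(main())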
datamule/sec/submissions/monitor.py CHANGED
@@ -5,7 +5,7 @@ from ..rss.monitor import start_monitor # Import start_monitor directly
 import pytz
 
 
-async def _process_efts_hits(hits, collected_accession_numbers, data_callback=None):
+async def _process_efts_hits(hits, collected_accession_numbers, data_callback=None,rate_limiter=None):
     """Process EFTS hits, collect accession numbers, and call data callback."""
     processed_hits = []
 
@@ -36,7 +36,7 @@ async def _process_efts_hits(hits, collected_accession_numbers, data_callback=No
 
     # Call data callback if provided
     if data_callback and processed_hits:
-        await data_callback(processed_hits)
+        await data_callback(processed_hits, rate_limiter)
 
     return processed_hits
 
@@ -61,7 +61,7 @@ async def _master_monitor_impl(data_callback=None, poll_callback=None, submissio
 
     # Prepare a wrapper callback to collect accession numbers
    async def process_callback(hits):
-        await _process_efts_hits(hits, collected_accession_numbers, data_callback)
+        await _process_efts_hits(hits, collected_accession_numbers, data_callback, efts_query.limiter)
 
     # Create an EFTSQuery instance
     efts_query = EFTSQuery(requests_per_second=requests_per_second)
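With this change, the monitor's data callback receives the query's rate limiter as a second positional argument. A hedged sketch of a callback written against the new signature — the body is illustrative only:

    # Hypothetical data callback for the monitor: note the extra rate_limiter parameter
    async def my_data_callback(processed_hits, rate_limiter):
        for hit in processed_hits:
            print(hit)
        # rate_limiter is the EFTSQuery's limiter (usable, by assumption, as
        # `async with rate_limiter:` for any follow-up requests the callback makes)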
datamule/sec/submissions/streamer.py CHANGED
@@ -21,8 +21,8 @@ def fix_filing_url(url):
     return url
 
 class Streamer(EFTSQuery):
-    def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None):
-        super().__init__(requests_per_second=requests_per_second)
+    def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None, quiet=False):
+        super().__init__(requests_per_second=requests_per_second, quiet=quiet)
         self.document_callback = document_callback
         self.document_queue = asyncio.Queue()
         self.download_in_progress = asyncio.Event()
@@ -57,12 +57,14 @@ class Streamer(EFTSQuery):
                     await callback(hits)
                     self.fetch_queue.task_done()
                 except Exception as e:
-                    print(f"\nError fetching {url}: {str(e)}")
+                    if not self.quiet:
+                        print(f"\nError fetching {url}: {str(e)}")
                     self.fetch_queue.task_done()
             except asyncio.CancelledError:
                 break
             except Exception as e:
-                print(f"\nWorker error: {str(e)}")
+                if not self.quiet:
+                    print(f"\nWorker error: {str(e)}")
                 self.fetch_queue.task_done()
 
     def _construct_submission_url(self, hit):
@@ -85,7 +87,8 @@ class Streamer(EFTSQuery):
 
             return url, cik, accno_w_dash
         except (KeyError, IndexError) as e:
-            print(f"Error constructing URL for hit: {hit}. Error: {str(e)}")
+            if not self.quiet:
+                print(f"Error constructing URL for hit: {hit}. Error: {str(e)}")
             return None, None, None
 
     async def _document_download_worker(self):
@@ -115,13 +118,15 @@ class Streamer(EFTSQuery):
 
                     self.document_queue.task_done()
                 except Exception as e:
-                    print(f"\nError streaming document {doc_url}: {str(e)}")
+                    if not self.quiet:
+                        print(f"\nError streaming document {doc_url}: {str(e)}")
                     self.document_queue.task_done()
 
             except asyncio.CancelledError:
                 break
             except Exception as e:
-                print(f"\nDocument worker error: {str(e)}")
+                if not self.quiet:
+                    print(f"\nDocument worker error: {str(e)}")
                 self.document_queue.task_done()
 
     async def document_download_callback(self, hits):
@@ -133,7 +138,7 @@ class Streamer(EFTSQuery):
         self.download_in_progress.set()
 
         # Create progress bar for documents if not exists
-        if not self.document_pbar:
+        if not self.document_pbar and not self.quiet:
             self.document_pbar = tqdm(total=0, desc="Streaming submissions")
 
         # Queue up the documents for download
@@ -141,7 +146,8 @@ class Streamer(EFTSQuery):
             doc_url, cik, accno = self._construct_submission_url(hit)
             if doc_url:
                 # Update document progress bar total
-                self.document_pbar.total += 1
+                if self.document_pbar:
+                    self.document_pbar.total += 1
                 self.total_documents += 1
 
                 # Add to download queue
@@ -159,8 +165,20 @@ class Streamer(EFTSQuery):
         # Signal that document download is complete
         self.download_in_progress.clear()
 
-    async def stream(self, cik=None, submission_type=None, filing_date=None):
-        """Main method to stream EFTS results and download documents"""
+    async def stream(self, cik=None, submission_type=None, filing_date=None, location=None, name=None):
+        """
+        Main method to stream EFTS results and download documents
+
+        Parameters:
+        cik (str or list): Central Index Key(s) for the company
+        submission_type (str or list): Filing form type(s) to filter by
+        filing_date (str, tuple, or list): Date or date range to filter by
+        location (str): Location code to filter by (e.g., 'CA' for California)
+        name (str): Company name to search for (alternative to providing CIK)
+
+        Returns:
+        list: List of all EFTS hits processed
+        """
         # Create document worker tasks
         self.document_workers = [
             asyncio.create_task(self._document_download_worker())
@@ -173,11 +191,12 @@ class Streamer(EFTSQuery):
         self.skipped_documents = 0
 
         # Run the main query with our document download callback
-        results = await self.query(cik, submission_type, filing_date, self.document_download_callback)
+        results = await self.query(cik, submission_type, filing_date, location, self.document_download_callback, name)
 
         # Make sure all document downloads are complete
         if self.download_in_progress.is_set():
-            print("Waiting for remaining document downloads to complete...")
+            if not self.quiet:
+                print("Waiting for remaining document downloads to complete...")
             await self.document_queue.join()
 
         # Clean up document workers
@@ -190,14 +209,17 @@ class Streamer(EFTSQuery):
         if self.document_pbar:
             self.document_pbar.close()
             self.document_pbar = None # Set to None to prevent reuse
-
-        print(f"\n--- Streaming complete: {len(results)} EFTS results processed ---")
-        if self.accession_numbers is not None:
-            print(f"--- {self.documents_processed} documents downloaded, {self.skipped_documents} skipped due to accession number filter ---")
+
+        if not self.quiet:
+            print(f"\n--- Streaming complete: {len(results)} EFTS results processed ---")
+            if self.accession_numbers is not None:
+                print(f"--- {self.documents_processed} documents downloaded, {self.skipped_documents} skipped due to accession number filter ---")
+
         return results
 
-def stream(cik=None, submission_type=None, filing_date=None,
-           requests_per_second=5.0, document_callback=None, accession_numbers=None):
+def stream(cik=None, submission_type=None, filing_date=None, location=None,
+           requests_per_second=5.0, document_callback=None, accession_numbers=None,
+           quiet=False, name=None):
     """
     Stream EFTS results and download documents into memory.
 
@@ -205,15 +227,28 @@ def stream(cik=None, submission_type=None, filing_date=None,
     - cik: CIK number(s) to query for
     - submission_type: Filing type(s) to query for
     - filing_date: Date or date range to query for
+    - location: Location code to filter by (e.g., 'CA' for California)
     - requests_per_second: Rate limit for SEC requests (combined EFTS and document downloads)
     - document_callback: Callback function that receives (hit, content, cik, accno, url)
     - accession_numbers: Optional list of accession numbers to filter by
+    - quiet: Whether to suppress progress output
+    - name: Company name to search for (alternative to providing CIK)
 
     Returns:
     - List of all EFTS hits processed
+
+    Example:
+    To search by company name:
+    results = stream(name="Tesla", submission_type="10-K")
+
+    To search by CIK:
+    results = stream(cik="1318605", submission_type="10-K")
+
+    To search with location filter:
+    results = stream(name="Tesla", location="CA", submission_type="10-K")
     """
-
-    # check if acc no is empty list
+
+    # Check if acc no is empty list
     if accession_numbers == []:
         raise ValueError("Applied filter resulted in empty accession numbers list")
 
@@ -221,8 +256,9 @@ def stream(cik=None, submission_type=None, filing_date=None,
         streamer = Streamer(
             requests_per_second=requests_per_second,
             document_callback=document_callback,
-            accession_numbers=accession_numbers
+            accession_numbers=accession_numbers,
+            quiet=quiet
         )
-        return await streamer.stream(cik, submission_type, filing_date)
+        return await streamer.stream(cik, submission_type, filing_date, location, name)
 
     return asyncio.run(run_stream())
datamule/sec/submissions/textsearch.py CHANGED
@@ -13,9 +13,9 @@ class TextSearchEFTSQuery(EFTSQuery):
         super().__init__(requests_per_second=requests_per_second, quiet=quiet)
         self.text_query = text_query
 
-    def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
+    def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
         # Get base parameters from parent class
-        params = super()._prepare_params(cik, submission_type, filing_date)
+        params = super()._prepare_params(cik, submission_type, filing_date, location)
 
         # Add text query parameter
         params['q'] = self.text_query
@@ -46,7 +46,8 @@ async def extract_accession_numbers(hits):
             accession_numbers.append(acc_no)
     return accession_numbers
 
-def query(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, quiet=False):
+def query(text_query, cik=None, submission_type=None, filing_date=None, location=None,
+          name=None, requests_per_second=5.0, quiet=False):
     """
     Search SEC filings for text and return the full search results.
 
@@ -63,6 +64,10 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
     filing_date : str, tuple, list, optional
         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
         a tuple of (start_date, end_date), or a list of dates.
+    location : str, optional
+        Location code to filter by (e.g., 'CA' for California).
+    name : str, optional
+        Company name to search for (alternative to providing CIK).
     requests_per_second : float, optional
         Maximum number of requests per second to make to the SEC API.
         Default is 5.0.
@@ -73,14 +78,23 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
     --------
     list
         Complete search results with all hit data.
+
+    Examples:
+    ---------
+    # Search for 'climate risk' in Tesla's 10-K filings using company name
+    results = query('"climate risk"', name='Tesla', submission_type='10-K')
+
+    # Search for 'pandemic' in California companies' filings
+    results = query('pandemic', location='CA', submission_type='8-K')
     """
     async def run_query():
         query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
-        return await query.query(cik, submission_type, filing_date)
+        return await query.query(cik, submission_type, filing_date, location, None, name)
 
     return asyncio.run(run_query())
 
-def filter_text(text_query, cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, quiet=False):
+def filter_text(text_query, cik=None, submission_type=None, filing_date=None, location=None,
+                name=None, requests_per_second=5.0, quiet=False):
     """
     Search SEC filings for text and return matching accession numbers.
 
@@ -97,6 +111,10 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
     filing_date : str, tuple, list, optional
         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
         a tuple of (start_date, end_date), or a list of dates.
+    location : str, optional
+        Location code to filter by (e.g., 'CA' for California).
+    name : str, optional
+        Company name to search for (alternative to providing CIK).
     requests_per_second : float, optional
         Maximum number of requests per second to make to the SEC API.
         Default is 5.0.
@@ -107,6 +125,15 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
     --------
     list
         List of accession numbers (as strings) for filings that match the text query.
+
+    Examples:
+    ---------
+    # Get accession numbers of Apple filings mentioning 'supply chain'
+    acc_numbers = filter_text('"supply chain"', name='Apple')
+
+    # Use the accession numbers as a filter in another API
+    from .downloader import download
+    download(name='Apple', accession_numbers=acc_numbers)
     """
     async def run_query():
         query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
@@ -119,7 +146,7 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
             all_acc_nos.extend(acc_nos)
 
         # Run the query with our callback
-        await query_obj.query(cik, submission_type, filing_date, collect_acc_nos)
+        await query_obj.query(cik, submission_type, filing_date, location, collect_acc_nos, name)
 
         return all_acc_nos
 
datamule/sheet.py CHANGED
@@ -30,5 +30,12 @@ class Sheet:
         pass
     def query_xbrl():
         pass
-    def query_13fhr():
+
+    # LIST TUPLE SYNTAX, so e.g. value (0,100) is 0-100, while [0,100] is 0 and 100
+    def get_13fhr(reportingOwnerCIK,nameOfIssuer,titleOfClass,cusip,value,
+                  shrsOrPrnAmt_sshPrnamt,shrsOrPrnAmt_sshPrnamtType,investmentDiscretion,otherManager,
+                  votingAuthority_Sole,
+                  votingAuthority_Shared,
+                  votingAuthority_None,
+                  filing_date):
         pass
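The comment above describes the intended filter convention for the still-stubbed `get_13fhr()`: a tuple is an inclusive range, a list is an explicit set of allowed values. A small illustrative sketch of that convention — the helper below is hypothetical, not part of the package:

    # Hypothetical illustration of the tuple/list filter convention described above
    def matches(actual, criterion):
        if criterion is None:
            return True                   # no filter applied
        if isinstance(criterion, tuple):  # (low, high) -> inclusive range
            low, high = criterion
            return low <= actual <= high
        if isinstance(criterion, list):   # [a, b, ...] -> exact values only
            return actual in criterion
        return actual == criterion        # scalar -> exact match

    matches(50, (0, 100))   # True: 50 falls inside the 0-100 range
    matches(50, [0, 100])   # False: only 0 and 100 are accepted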
datamule/submission.py CHANGED
@@ -1,16 +1,37 @@
 from pathlib import Path
 import json
 from .document import Document
+from secsgml import parse_sgml_submission_into_memory
+from pathlib import Path
 
 class Submission:
-    def __init__(self, path):
-        self.path = Path(path)
-        self._load_metadata()
+    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
+        if path is None and sgml_content is None:
+            raise ValueError("Either path or sgml_content must be provided")
+        if path is not None and sgml_content is not None:
+            raise ValueError("Only one of path or sgml_content must be provided")
+
+        if sgml_content is not None:
+            self.path = None
+            self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+
+            for idx,doc in enumerate(self.metadata['documents']):
+                type = doc.get('type')
+
+                # Keep only specified types
+                if keep_document_types is not None and type not in keep_document_types:
+                    continue
+                filename = doc.get('filename')
+                extension = Path(filename).suffix
+                self.documents = [Document(type=type, content=raw_documents[idx], extension=extension)]
+
+
+        if path is not None:
+            self.path = Path(path)
+            metadata_path = self.path / 'metadata.json'
+            with metadata_path.open('r') as f:
+                self.metadata = json.load(f)
 
-    def _load_metadata(self):
-        metadata_path = self.path / 'metadata.json'
-        with metadata_path.open('r') as f:
-            self.metadata = json.load(f)
 
     def document_type(self, document_type):
         # Convert single document type to list for consistent handling
@@ -19,20 +40,73 @@ class Submission:
         else:
             document_types = document_type
 
-        for doc in self.metadata['documents']:
+        for idx,doc in enumerate(self.metadata['documents']):
             if doc['type'] in document_types:
+
+                # if loaded from path
+                if self.path is not None:
+                    filename = doc.get('filename')
+                    # oh we need handling here for sequences case
+                    if filename is None:
+                        filename = doc['sequence'] + '.txt'
+
+                    document_path = self.path / filename
+                    extension = document_path.suffix
+
+                    with document_path.open('r') as f:
+                        content = f.read()
+
+                    yield Document(type=doc['type'], content=content, extension=extension)
+                # if loaded from sgml_content
+                else:
+                    yield self.documents[idx]
+
+
+    def __iter__(self):
+        for idx,doc in enumerate(self.metadata['documents']):
+            # if loaded from path
+            if self.path is not None:
                 filename = doc.get('filename')
+
+                # oh we need handling here for sequences case
                 if filename is None:
-                    continue
+                    filename = doc['sequence'] + '.txt'
 
                 document_path = self.path / filename
-                yield Document(doc['type'], document_path)
-
-    def __iter__(self):
-        for doc in self.metadata['documents']:
-            filename = doc.get('filename')
-            if filename is None:
-                continue
-
-            document_path = self.path / filename
-            yield Document(doc['type'], document_path)
+                extension = document_path.suffix
+
+                # check if the file exists
+                if document_path.exists():
+                    with document_path.open('r') as f:
+                        content = f.read()
+
+                    yield Document(type=doc['type'], content=content, extension=extension)
+                else:
+                    print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
+
+            # if loaded from sgml_content
+            else:
+                yield self.documents[idx]
+
+    # keep documents by document type
+    def keep(self, document_type):
+        # Convert single document type to list for consistent handling
+        if isinstance(document_type, str):
+            document_types = [document_type]
+        else:
+            document_types = document_type
+
+        if self.path is not None:
+            for doc in self.metadata['documents']:
+                filename = doc.get('filename')
+                type = doc.get('type')
+                if type not in document_types:
+                    # oh we need handling here for sequences case
+                    if filename is None:
+                        filename = doc.sequence + '.txt'
+
+                    document_path = self.path / filename
+                    # delete the file
+                    document_path.unlink()
+        else:
+            print("Warning: keep() method is only available when loading from path.")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.1.5
+Version: 1.1.7
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -1,11 +1,11 @@
 datamule/__init__.py,sha256=l6YlwT5EeRxPlCtO5Jd4I8l266rSRUJyfFe97cRtSCM,991
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
-datamule/document.py,sha256=BC8jdVy9pMOA9ghIqV5N2XJidmVNThqbBohsuSAnVoY,10813
+datamule/document.py,sha256=7FBmjWJJfdKrbQ4UH4J8It7W5GEWTFFEUfQdODUrYlQ,10160
 datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
 datamule/index.py,sha256=0txvbzPcvY1GsdxA-wGdLzAByxSeE_1VyyBp9mZEQRM,2292
-datamule/portfolio.py,sha256=JmZlTrom_g7FXKXxWp_CiQTyC7p6_cDP08G0kFUja48,6982
-datamule/sheet.py,sha256=WwumRdniClGU7W3AXVLOpCdMnepLC7KMrRpQlA6_NUY,1022
-datamule/submission.py,sha256=JsxYlEz1Ywu6eC32OS15p4p-p8qB6SWd_rXuf2p5UfY,1247
+datamule/portfolio.py,sha256=ECevaiF8P6v4mJ7W9IM4hRKNF0GGdQzc1SzBWLnG2qQ,7082
+datamule/sheet.py,sha256=FF0JL8BuAZ7Sd_LY_-sCGJuYlhm3sKgj2jlHUGMjeUQ,1406
+datamule/submission.py,sha256=zWCnucjmfTYcr1Hm9Us-TjGLjWAHuRPtIyaVpLNvs4c,4427
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
@@ -16,11 +16,11 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/sec/submissions/downloader.py,sha256=HxbSkNotLLW6ROmU30rnXPlCo9gY3SoB1Z4ZWvj9FIY,2669
-datamule/sec/submissions/eftsquery.py,sha256=v6YMBZzksqweqHnNIllMFN-frWypAgvZPKx2FH1UrL4,22515
-datamule/sec/submissions/monitor.py,sha256=XkwH5nvzr_dNttmFRQ52m7344IKbOtWDfOZIEdie4H8,5234
-datamule/sec/submissions/streamer.py,sha256=hc61le7gGIIWp1KEaOv_PhriUxf7YYFkQrSKELlZ3pg,9748
-datamule/sec/submissions/textsearch.py,sha256=oEIUrcO3HW-4dcyPCiOTvM7UUimNEM4HNIb-Juvc1BQ,4642
+datamule/sec/submissions/downloader.py,sha256=IB08W8-lQD5Bb0LgzrTN4Xi4HsCw24DybRLHqE1AUrU,3290
+datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
+datamule/sec/submissions/monitor.py,sha256=F24I9yn1k8ggbCJQ-Vk7go_qJHlpkBzVKFYKDs_CWLs,5287
+datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
+datamule/sec/submissions/textsearch.py,sha256=-a5yIrrxxtaK10IJeywFmXuJmSndYL9VKm4SC4I9JAs,5808
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
 datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
@@ -29,7 +29,7 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/downloader.py,sha256=Zb1TxsIz887tO3MJVP66siYVtNus89ti-g9oZ6VywrM,11500
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.1.5.dist-info/METADATA,sha256=9Q8YzsBipVuGYN4eWmH49sF5oyouyZvVdJ6rncDa0VE,512
-datamule-1.1.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-datamule-1.1.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-1.1.5.dist-info/RECORD,,
+datamule-1.1.7.dist-info/METADATA,sha256=gIryya087eiyvgFA5S5vf2s_wKDxaV3ZEAJA7-W4kS8,512
+datamule-1.1.7.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+datamule-1.1.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.1.7.dist-info/RECORD,,