datamule 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document.py +29 -44
- datamule/portfolio.py +6 -2
- datamule/sec/submissions/downloader.py +19 -2
- datamule/sec/submissions/eftsquery.py +129 -8
- datamule/sec/submissions/monitor.py +3 -3
- datamule/sec/submissions/streamer.py +59 -23
- datamule/sec/submissions/textsearch.py +33 -6
- datamule/sheet.py +8 -1
- datamule/submission.py +93 -19
- {datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/METADATA +1 -1
- {datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/RECORD +13 -13
- {datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/WHEEL +0 -0
- {datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/top_level.txt +0 -0
datamule/document.py
CHANGED
@@ -8,31 +8,34 @@ from .mapping_dicts.xml_mapping_dicts import dict_345
 from selectolax.parser import HTMLParser
 
 class Document:
-    def __init__(self, type,
+    def __init__(self, type, content, extension):
+
         self.type = type
-
+        # we will remove this later #
+        # make sure extension is in lower case
+        extension = extension.lower()
+        self.content = content
+        if extension == '.txt':
+            self.content = self._preprocess_txt_content()
+        elif extension in ['.htm', '.html']:
+            self.content = self._preprocess_html_content()
 
+        self.extension = extension
+        # this will be filled by parsed
         self.data = None
-        self.content = None
-
 
-
-
-        self.content
-
-    def _load_text_content(self):
-        with open(self.path) as f:
-            return f.read().translate(str.maketrans({
+    #_load_text_content
+    def _preprocess_txt_content(self):
+        return self.content.read().translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
         }))
 
     # will deprecate this when we add html2dict
-    def
-
-
-
+    def _preprocess_html_content(self):
+        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
+
         # Remove hidden elements first
         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
         for node in hidden_nodes:
@@ -83,20 +86,9 @@ class Document:
             '\u201c': '"', '\u201d': '"'
         }))
 
-    def _load_file_content(self):
-        if self.path.suffix =='.txt':
-            self.content = self._load_text_content()
-        elif self.path.suffix in ['.html','.htm']:
-            self.content = self._load_html_content()
-        else:
-            raise ValueError(f"Unsupported file type: {self.path.suffix}")
-
-
     def contains_string(self, pattern):
-        """
-        if self.
-        if self.content is None:
-            self.content = self._load_file_content(self.path)
+        """Works for select files"""
+        if self.extension in ['.htm', '.html', '.txt','.xml']:
             return bool(re.search(pattern, self.content))
         return False
 
@@ -104,15 +96,14 @@ class Document:
     def parse(self):
         mapping_dict = None
 
-        if self.
+        if self.extension == '.xml':
             if self.type in ['3', '4', '5']:
                 mapping_dict = dict_345
 
-            self.load_content()
             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+
         # will deprecate this when we add html2dict
-        elif self.
-            self._load_file_content()
+        elif self.extension in ['.htm', '.html','.txt']:
 
             if self.type == '10-K':
                 mapping_dict = dict_10k
@@ -133,18 +124,12 @@ class Document:
         if not self.data:
             self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
-
         with open(output_filename, 'w',encoding='utf-8') as f:
             json.dump(self.data, f, indent=2)
 
     def write_csv(self, output_filename=None, accession_number=None):
         self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
-
         with open(output_filename, 'w', newline='') as csvfile:
             if not self.data:
                 return output_filename
@@ -165,7 +150,7 @@ class Document:
             writer.writeheader()
             for row in self.data:
                 if accession_number:
-                    row['Accession Number'] =
+                    row['Accession Number'] = accession_number
                 writer.writerow(row)
 
         return output_filename
@@ -225,7 +210,7 @@ class Document:
         # Let's remove XML iterable for now
 
         # Handle text-based documents
-        if self.
+        if self.extension in ['.txt', '.htm', '.html']:
            document_data = self.data
            if not document_data:
                return iter([])
@@ -235,13 +220,13 @@ class Document:
            section_type = None
 
            if self.type in ['10-K', '10-Q']:
-                mapping_dict =
+                mapping_dict = dict_10k if self.type == '10-K' else dict_10q
            elif self.type == '8-K':
-                mapping_dict =
+                mapping_dict = dict_8k
            elif self.type == 'SC 13D':
-                mapping_dict =
+                mapping_dict = dict_13d
            elif self.type == 'SC 13G':
-                mapping_dict =
+                mapping_dict = dict_13g
            else:
                return iter([])
 
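Net effect of the document.py changes: Document no longer lazily loads from self.path. It now takes content and extension in the constructor, lower-cases the extension, and preprocesses .txt and .htm/.html content immediately. A minimal usage sketch under the new signature (the HTML string is illustrative, and it assumes the HTML preprocessor returns the visible text, as the hidden-node stripping above suggests):

    from datamule.document import Document

    # Hypothetical in-memory filing fragment
    html = "<html><body><p>Item 1A. Risk Factors</p></body></html>"

    doc = Document(type='10-K', content=html, extension='.HTML')
    print(doc.extension)                # '.html' (lower-cased by the constructor)
    print(doc.contains_string('Risk'))  # regex search over the preprocessed content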
datamule/portfolio.py
CHANGED
@@ -142,7 +142,7 @@ class Portfolio:
             cik=cik,
             submission_type=submission_type,
             filing_date=filing_date,
-            requests_per_second=5,
+            requests_per_second=5,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
         )
 
@@ -179,4 +179,8 @@ class Portfolio:
             document_types = [document_types]
 
         for submission in self.submissions:
-            yield from submission.document_type(document_types)
+            yield from submission.document_type(document_types)
+
+    def keep(self,document_type):
+        for submission in self.__iter__():
+            submission.keep(document_type)
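The new Portfolio.keep() fans the filter out to Submission.keep() (shown further below), which deletes non-matching document files on disk. A hedged sketch of the call pattern, assuming Portfolio is exported at the package root and takes a directory of downloaded submissions as in earlier datamule versions:

    from datamule import Portfolio  # assumed package-root export

    portfolio = Portfolio('filings')  # hypothetical directory of downloaded submissions
    portfolio.keep('10-K')            # deletes every non-10-K document file per submission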
datamule/sec/submissions/downloader.py
CHANGED
@@ -36,7 +36,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         print(f"Error processing {accno}: {e}")
         return None
 
-def download(cik=None, submission_type=None, filing_date=None,
+def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None, quiet=False):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -44,12 +45,25 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_seco
     - cik: CIK number(s) to query for
     - submission_type: Filing type(s) to query for (default: 10-K)
     - filing_date: Date or date range to query for
+    - location: Location code to filter by (e.g., 'CA' for California)
+    - name: Company name to search for (alternative to providing CIK)
     - requests_per_second: Rate limit for SEC requests
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
+    - quiet: Whether to suppress progress output
 
     Returns:
     - List of all document paths processed
+
+    Examples:
+    # Download filings by CIK
+    download(cik="1318605", submission_type="10-K")
+
+    # Download filings by company name
+    download(name="Tesla", submission_type="10-K")
+
+    # Download filings with location filter
+    download(name="Apple", location="CA", submission_type="10-K")
     """
 
     # Make sure output directory exists
@@ -62,9 +76,12 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_seco
     # Call the stream function with our callback
     return stream(
         cik=cik,
+        name=name,
         submission_type=submission_type,
         filing_date=filing_date,
+        location=location,
         requests_per_second=requests_per_second,
         document_callback=callback_wrapper,
-        accession_numbers=accession_numbers
+        accession_numbers=accession_numbers,
+        quiet=quiet
     )
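download() now forwards name, location, and quiet straight through to stream(). A sketch combining the new filters with the existing accession-number whitelist (all values hypothetical):

    from datamule.sec.submissions.downloader import download

    paths = download(
        name="Tesla",                                # resolved to a CIK via the EFTS name search
        location="CA",                               # EDGAR location code
        submission_type="10-K",
        accession_numbers=["0001628280-24-002390"],  # hypothetical accession number
        quiet=True,                                  # suppress progress output
    )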
datamule/sec/submissions/eftsquery.py
CHANGED
@@ -42,6 +42,67 @@ class EFTSQuery:
         await self.session.close()
         self.session = None
 
+    async def search_name(self, name):
+        """
+        Search for companies by name using the EFTS name search endpoint.
+
+        Parameters:
+        name (str): Company name to search for
+
+        Returns:
+        list: List of dictionaries containing company information (entity, id, tickers if available)
+        """
+        if not self.session:
+            raise RuntimeError("No active session. This method must be called within an async context.")
+
+        url = f"{self.base_url}?keysTyped={name}"
+
+        if not self.quiet:
+            print(f"Searching for company: {name}")
+
+        async with self.limiter:
+            try:
+                async with self.session.get(url) as response:
+                    if response.status == 429:
+                        raise RetryException(url)
+                    response.raise_for_status()
+                    content = await response.read()
+                    await self.rate_monitor.add_request(len(content))
+                    data = await response.json()
+
+                    if 'hits' in data and 'hits' in data['hits']:
+                        hits = data['hits']['hits']
+                        results = []
+
+                        for hit in hits:
+                            source = hit.get('_source', {})
+                            result = {
+                                'entity': source.get('entity', ''),
+                                'id': hit.get('_id', ''),
+                                'tickers': source.get('tickers', '')
+                            }
+                            results.append(result)
+
+                        if not self.quiet and results:
+                            # Create a compact display of results
+                            display_results = [f"{r['entity']} [{r['id']}]" for r in results]
+                            print(f"Name matches: {', '.join(display_results[:5])}")
+                            if len(results) > 5:
+                                print(f"...and {len(results) - 5} more matches")
+
+                        return results
+                    return []
+            except aiohttp.ClientResponseError as e:
+                if e.status == 429:
+                    raise RetryException(url)
+                if not self.quiet:
+                    print(f"Error searching for company: {str(e)}")
+                return []
+            except Exception as e:
+                if not self.quiet:
+                    print(f"Error searching for company: {str(e)}")
+                return []
+
     def _get_form_exclusions(self, form):
         """Dynamically generate form exclusions based on patterns"""
         # Skip already negated forms
@@ -55,7 +116,7 @@ class EFTSQuery:
         # No exclusions for amendment forms
         return []
 
-    def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
+    def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
         params = {}
 
         # Handle CIK
@@ -111,6 +172,10 @@ class EFTSQuery:
             params['startdt'] = "2001-01-01"
             params['enddt'] = datetime.now().strftime('%Y-%m-%d')
 
+        # Handle location filtering
+        if location:
+            params['filter_location'] = location
+
         return params
 
     def _get_query_description(self, params):
@@ -125,6 +190,9 @@ class EFTSQuery:
         if 'startdt' in params and 'enddt' in params:
             parts.append(f"dates={params['startdt']} to {params['enddt']}")
 
+        if 'filter_location' in params:
+            parts.append(f"location={params['filter_location']}")
+
         return ", ".join(parts)
 
     async def _fetch_json(self, url):
@@ -413,12 +481,26 @@ class EFTSQuery:
         for params, from_val, size_val, callback in self.pending_page_requests:
             await self.fetch_queue.put((params, from_val, size_val, callback))
 
-    async def query(self, cik=None, submission_type=None, filing_date=None, callback=None):
-
-
+    async def query(self, cik=None, submission_type=None, filing_date=None, location=None, callback=None, name=None):
+        """
+        Query SEC filings using the EFTS API.
+
+        Parameters:
+        cik (str or list): Central Index Key(s) for the company
+        submission_type (str or list): Filing form type(s) to filter by
+        filing_date (str, tuple, or list): Date or date range to filter by
+        location (str): Location code to filter by (e.g., 'CA' for California)
+        callback (function): Async callback function to process results as they arrive
+        name (str): Company name to search for (alternative to providing CIK)
 
-
-
+        Returns:
+        list: List of filing documents matching the query criteria
+        """
+        # If both CIK and name are provided, raise an error
+        if cik is not None and name is not None:
+            raise ValueError("Please provide either 'name' or 'cik', not both")
+
+        all_hits = []
 
         # Collector callback to gather all hits
         async def collect_hits(hits):
@@ -427,6 +509,25 @@ class EFTSQuery:
                 await callback(hits)
 
         async with self as client:
+            # If name is provided, search for matching companies inside the context manager
+            if name is not None:
+                company_results = await self.search_name(name)
+                if not company_results:
+                    if not self.quiet:
+                        print(f"No companies found matching: {name}")
+                    return []
+
+                # Use the first (best) match's CIK
+                cik = company_results[0]['id']
+                if not self.quiet:
+                    print(f"Using CIK {cik} for {company_results[0]['entity']}")
+
+            # Now prepare parameters with the CIK (either provided directly or from name search)
+            params = self._prepare_params(cik, submission_type, filing_date, location)
+
+            # Check if this is a primary documents query
+            self.was_primary_docs_query = '-0' in params.get('forms', '').split(',')
+
             # Reset state for new query
             self.total_results_to_fetch = 0
             self.pending_page_requests = []
@@ -506,12 +607,32 @@ class EFTSQuery:
             print(f"\n--- Query complete: {len(all_hits):,} submissions retrieved ---")
         return all_hits
 
-def query_efts(cik=None, submission_type=None, filing_date=None, requests_per_second=5.0, callback=None, quiet=False):
+def query_efts(cik=None, submission_type=None, filing_date=None, location=None, requests_per_second=5.0, callback=None, quiet=False, name=None):
     """
     Convenience function to run a query without managing the async context.
+
+    Parameters:
+    cik (str or list): Central Index Key(s) for the company
+    submission_type (str or list): Filing form type(s) to filter by
+    filing_date (str, tuple, or list): Date or date range to filter by
+    location (str): Location code to filter by (e.g., 'CA' for California)
+    requests_per_second (float): Maximum requests per second to make to the SEC API
+    callback (function): Async callback function to process results as they arrive
+    quiet (bool): Whether to suppress progress output
+    name (str): Company name to search for (alternative to providing CIK)
+
+    Returns:
+    list: List of filing documents matching the query criteria
+
+    Example:
+    To search by company name:
+    results = query_efts(name="Tesla", submission_type="10-K")
+
+    To search by CIK:
+    results = query_efts(cik="1318605", submission_type="10-K")
     """
     async def run_query():
         query = EFTSQuery(requests_per_second=requests_per_second, quiet=quiet)
-        return await query.query(cik, submission_type, filing_date, callback)
+        return await query.query(cik, submission_type, filing_date, location, callback, name)
 
     return asyncio.run(run_query())
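search_name() hits the EFTS base URL with a keysTyped parameter and returns entity/id/tickers dicts; query() then takes the first match's id as the CIK. A sketch of calling it directly (relies on the async context manager that query()'s own "async with self as client" implies):

    import asyncio
    from datamule.sec.submissions.eftsquery import EFTSQuery

    async def main():
        q = EFTSQuery(requests_per_second=5.0, quiet=True)
        async with q:  # search_name raises RuntimeError without an active session
            matches = await q.search_name("Tesla")
        # Each match looks like {'entity': ..., 'id': ..., 'tickers': ...};
        # query() would use matches[0]['id'] as the CIK
        print(matches[:3])

    asyncio.run(main())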
datamule/sec/submissions/monitor.py
CHANGED
@@ -5,7 +5,7 @@ from ..rss.monitor import start_monitor # Import start_monitor directly
 import pytz
 
 
-async def _process_efts_hits(hits, collected_accession_numbers, data_callback=None):
+async def _process_efts_hits(hits, collected_accession_numbers, data_callback=None,rate_limiter=None):
     """Process EFTS hits, collect accession numbers, and call data callback."""
     processed_hits = []
 
@@ -36,7 +36,7 @@ async def _process_efts_hits(hits, collected_accession_numbers, data_callback=No
 
     # Call data callback if provided
     if data_callback and processed_hits:
-        await data_callback(processed_hits)
+        await data_callback(processed_hits, rate_limiter)
 
     return processed_hits
 
@@ -61,7 +61,7 @@ async def _master_monitor_impl(data_callback=None, poll_callback=None, submissio
 
     # Prepare a wrapper callback to collect accession numbers
     async def process_callback(hits):
-
+        await _process_efts_hits(hits, collected_accession_numbers, data_callback, efts_query.limiter)
 
     # Create an EFTSQuery instance
     efts_query = EFTSQuery(requests_per_second=requests_per_second)
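Since _process_efts_hits now threads the query's limiter through, any data_callback passed to the monitor must accept two arguments. A conforming callback sketch (the body is hypothetical; it assumes the limiter is usable as an async context manager, as its "async with self.limiter" usage in eftsquery.py shows):

    async def my_data_callback(processed_hits, rate_limiter):
        for hit in processed_hits:
            async with rate_limiter:  # share the monitor's SEC rate limit for follow-ups
                ...                   # hypothetical per-hit processing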
datamule/sec/submissions/streamer.py
CHANGED
@@ -21,8 +21,8 @@ def fix_filing_url(url):
     return url
 
 class Streamer(EFTSQuery):
-    def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None):
-        super().__init__(requests_per_second=requests_per_second)
+    def __init__(self, requests_per_second=5.0, document_callback=None, accession_numbers=None, quiet=False):
+        super().__init__(requests_per_second=requests_per_second, quiet=quiet)
         self.document_callback = document_callback
         self.document_queue = asyncio.Queue()
         self.download_in_progress = asyncio.Event()
@@ -57,12 +57,14 @@ class Streamer(EFTSQuery):
                         await callback(hits)
                     self.fetch_queue.task_done()
                 except Exception as e:
-
+                    if not self.quiet:
+                        print(f"\nError fetching {url}: {str(e)}")
                     self.fetch_queue.task_done()
             except asyncio.CancelledError:
                 break
             except Exception as e:
-
+                if not self.quiet:
+                    print(f"\nWorker error: {str(e)}")
                 self.fetch_queue.task_done()
 
     def _construct_submission_url(self, hit):
@@ -85,7 +87,8 @@ class Streamer(EFTSQuery):
 
             return url, cik, accno_w_dash
         except (KeyError, IndexError) as e:
-
+            if not self.quiet:
+                print(f"Error constructing URL for hit: {hit}. Error: {str(e)}")
             return None, None, None
 
     async def _document_download_worker(self):
@@ -115,13 +118,15 @@ class Streamer(EFTSQuery):
 
                     self.document_queue.task_done()
                 except Exception as e:
-
+                    if not self.quiet:
+                        print(f"\nError streaming document {doc_url}: {str(e)}")
                     self.document_queue.task_done()
 
             except asyncio.CancelledError:
                 break
             except Exception as e:
-
+                if not self.quiet:
+                    print(f"\nDocument worker error: {str(e)}")
                 self.document_queue.task_done()
 
     async def document_download_callback(self, hits):
@@ -133,7 +138,7 @@ class Streamer(EFTSQuery):
         self.download_in_progress.set()
 
         # Create progress bar for documents if not exists
-        if not self.document_pbar:
+        if not self.document_pbar and not self.quiet:
             self.document_pbar = tqdm(total=0, desc="Streaming submissions")
 
         # Queue up the documents for download
@@ -141,7 +146,8 @@ class Streamer(EFTSQuery):
             doc_url, cik, accno = self._construct_submission_url(hit)
             if doc_url:
                 # Update document progress bar total
-                self.document_pbar
+                if self.document_pbar:
+                    self.document_pbar.total += 1
                 self.total_documents += 1
 
                 # Add to download queue
@@ -159,8 +165,20 @@ class Streamer(EFTSQuery):
         # Signal that document download is complete
         self.download_in_progress.clear()
 
-    async def stream(self, cik=None, submission_type=None, filing_date=None):
-        """
+    async def stream(self, cik=None, submission_type=None, filing_date=None, location=None, name=None):
+        """
+        Main method to stream EFTS results and download documents
+
+        Parameters:
+        cik (str or list): Central Index Key(s) for the company
+        submission_type (str or list): Filing form type(s) to filter by
+        filing_date (str, tuple, or list): Date or date range to filter by
+        location (str): Location code to filter by (e.g., 'CA' for California)
+        name (str): Company name to search for (alternative to providing CIK)
+
+        Returns:
+        list: List of all EFTS hits processed
+        """
         # Create document worker tasks
         self.document_workers = [
             asyncio.create_task(self._document_download_worker())
@@ -173,11 +191,12 @@ class Streamer(EFTSQuery):
         self.skipped_documents = 0
 
         # Run the main query with our document download callback
-        results = await self.query(cik, submission_type, filing_date, self.document_download_callback)
+        results = await self.query(cik, submission_type, filing_date, location, self.document_download_callback, name)
 
         # Make sure all document downloads are complete
         if self.download_in_progress.is_set():
-
+            if not self.quiet:
+                print("Waiting for remaining document downloads to complete...")
             await self.document_queue.join()
 
         # Clean up document workers
@@ -190,14 +209,17 @@ class Streamer(EFTSQuery):
         if self.document_pbar:
             self.document_pbar.close()
             self.document_pbar = None  # Set to None to prevent reuse
-
-
-
-
+
+        if not self.quiet:
+            print(f"\n--- Streaming complete: {len(results)} EFTS results processed ---")
+            if self.accession_numbers is not None:
+                print(f"--- {self.documents_processed} documents downloaded, {self.skipped_documents} skipped due to accession number filter ---")
+
         return results
 
-def stream(cik=None, submission_type=None, filing_date=None,
-
+def stream(cik=None, submission_type=None, filing_date=None, location=None,
+           requests_per_second=5.0, document_callback=None, accession_numbers=None,
+           quiet=False, name=None):
     """
     Stream EFTS results and download documents into memory.
 
@@ -205,15 +227,28 @@ def stream(cik=None, submission_type=None, filing_date=None,
     - cik: CIK number(s) to query for
     - submission_type: Filing type(s) to query for
     - filing_date: Date or date range to query for
+    - location: Location code to filter by (e.g., 'CA' for California)
     - requests_per_second: Rate limit for SEC requests (combined EFTS and document downloads)
     - document_callback: Callback function that receives (hit, content, cik, accno, url)
    - accession_numbers: Optional list of accession numbers to filter by
+    - quiet: Whether to suppress progress output
+    - name: Company name to search for (alternative to providing CIK)
 
     Returns:
     - List of all EFTS hits processed
+
+    Example:
+    To search by company name:
+    results = stream(name="Tesla", submission_type="10-K")
+
+    To search by CIK:
+    results = stream(cik="1318605", submission_type="10-K")
+
+    To search with location filter:
+    results = stream(name="Tesla", location="CA", submission_type="10-K")
     """
-
-    #
+
+    # Check if acc no is empty list
     if accession_numbers == []:
         raise ValueError("Applied filter resulted in empty accession numbers list")
 
@@ -221,8 +256,9 @@ def stream(cik=None, submission_type=None, filing_date=None,
         streamer = Streamer(
             requests_per_second=requests_per_second,
             document_callback=document_callback,
-            accession_numbers=accession_numbers
+            accession_numbers=accession_numbers,
+            quiet=quiet
         )
-        return await streamer.stream(cik, submission_type, filing_date)
+        return await streamer.stream(cik, submission_type, filing_date, location, name)
 
     return asyncio.run(run_stream())
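Per the docstring, document_callback receives (hit, content, cik, accno, url) for each submission streamed into memory. A sketch wiring one up together with the new keyword arguments (values hypothetical):

    from datamule.sec.submissions.streamer import stream

    async def on_document(hit, content, cik, accno, url):
        # content is the raw submission held in memory, never written to disk
        print(f"{accno}: {len(content)} bytes from {url}")

    hits = stream(
        name="Tesla",           # resolved to a CIK via EFTSQuery.search_name
        location="CA",
        submission_type="8-K",
        document_callback=on_document,
        quiet=True,
    )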
datamule/sec/submissions/textsearch.py
CHANGED
@@ -13,9 +13,9 @@ class TextSearchEFTSQuery(EFTSQuery):
         super().__init__(requests_per_second=requests_per_second, quiet=quiet)
         self.text_query = text_query
 
-    def _prepare_params(self, cik=None, submission_type=None, filing_date=None):
+    def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
         # Get base parameters from parent class
-        params = super()._prepare_params(cik, submission_type, filing_date)
+        params = super()._prepare_params(cik, submission_type, filing_date, location)
 
         # Add text query parameter
         params['q'] = self.text_query
@@ -46,7 +46,8 @@ async def extract_accession_numbers(hits):
             accession_numbers.append(acc_no)
     return accession_numbers
 
-def query(text_query, cik=None, submission_type=None, filing_date=None,
+def query(text_query, cik=None, submission_type=None, filing_date=None, location=None,
+          name=None, requests_per_second=5.0, quiet=False):
     """
     Search SEC filings for text and return the full search results.
 
@@ -63,6 +64,10 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
     filing_date : str, tuple, list, optional
         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
         a tuple of (start_date, end_date), or a list of dates.
+    location : str, optional
+        Location code to filter by (e.g., 'CA' for California).
+    name : str, optional
+        Company name to search for (alternative to providing CIK).
     requests_per_second : float, optional
         Maximum number of requests per second to make to the SEC API.
         Default is 5.0.
@@ -73,14 +78,23 @@ def query(text_query, cik=None, submission_type=None, filing_date=None, requests
     --------
     list
         Complete search results with all hit data.
+
+    Examples:
+    ---------
+    # Search for 'climate risk' in Tesla's 10-K filings using company name
+    results = query('"climate risk"', name='Tesla', submission_type='10-K')
+
+    # Search for 'pandemic' in California companies' filings
+    results = query('pandemic', location='CA', submission_type='8-K')
     """
     async def run_query():
         query = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
-        return await query.query(cik, submission_type, filing_date)
+        return await query.query(cik, submission_type, filing_date, location, None, name)
 
     return asyncio.run(run_query())
 
-def filter_text(text_query, cik=None, submission_type=None, filing_date=None,
+def filter_text(text_query, cik=None, submission_type=None, filing_date=None, location=None,
+                name=None, requests_per_second=5.0, quiet=False):
     """
     Search SEC filings for text and return matching accession numbers.
 
@@ -97,6 +111,10 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
     filing_date : str, tuple, list, optional
         Date or date range to filter by. Can be a single date string ('YYYY-MM-DD'),
         a tuple of (start_date, end_date), or a list of dates.
+    location : str, optional
+        Location code to filter by (e.g., 'CA' for California).
+    name : str, optional
+        Company name to search for (alternative to providing CIK).
     requests_per_second : float, optional
         Maximum number of requests per second to make to the SEC API.
         Default is 5.0.
@@ -107,6 +125,15 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
     --------
     list
         List of accession numbers (as strings) for filings that match the text query.
+
+    Examples:
+    ---------
+    # Get accession numbers of Apple filings mentioning 'supply chain'
+    acc_numbers = filter_text('"supply chain"', name='Apple')
+
+    # Use the accession numbers as a filter in another API
+    from .downloader import download
+    download(name='Apple', accession_numbers=acc_numbers)
     """
     async def run_query():
         query_obj = TextSearchEFTSQuery(text_query, requests_per_second=requests_per_second, quiet=quiet)
@@ -119,7 +146,7 @@ def filter_text(text_query, cik=None, submission_type=None, filing_date=None, re
             all_acc_nos.extend(acc_nos)
 
         # Run the query with our callback
-        await query_obj.query(cik, submission_type, filing_date, collect_acc_nos)
+        await query_obj.query(cik, submission_type, filing_date, location, collect_acc_nos, name)
 
         return all_acc_nos
 
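One detail worth noting in the docstring examples: EDGAR full-text search treats a double-quoted string as an exact phrase, so the Python literal carries both quote levels:

    from datamule.sec.submissions.textsearch import query

    exact = query('"climate risk"', name='Tesla', submission_type='10-K')  # exact phrase
    loose = query('climate risk', name='Tesla', submission_type='10-K')    # terms matched individually, not as a phrase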
datamule/sheet.py
CHANGED
@@ -30,5 +30,12 @@ class Sheet:
         pass
     def query_xbrl():
         pass
-
+
+    # LIST TUPLE SYNTAX, so e.g. value (0,100) is 0-100, while [0,100] is 0 and 100
+    def get_13fhr(reportingOwnerCIK,nameOfIssuer,titleOfClass,cusip,value,
+                  shrsOrPrnAmt_sshPrnamt,shrsOrPrnAmt_sshPrnamtType,investmentDiscretion,otherManager,
+                  votingAuthority_Sole,
+                  votingAuthority_Shared,
+                  votingAuthority_None,
+                  filing_date):
         pass
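get_13fhr is still a stub, but its comment pins down the planned filter convention: a tuple like (0,100) is an inclusive range, while a list like [0,100] means the exact values 0 and 100. A self-contained sketch of that convention (my illustration, not code from the package):

    def matches(value, criterion):
        if isinstance(criterion, tuple):   # (low, high) -> inclusive range
            low, high = criterion
            return low <= value <= high
        if isinstance(criterion, list):    # [a, b, ...] -> exact membership
            return value in criterion
        return value == criterion

    print(matches(50, (0, 100)))  # True: 50 lies within 0-100
    print(matches(50, [0, 100]))  # False: 50 is neither exactly 0 nor exactly 100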
datamule/submission.py
CHANGED
@@ -1,16 +1,37 @@
 from pathlib import Path
 import json
 from .document import Document
+from secsgml import parse_sgml_submission_into_memory
+from pathlib import Path
 
 class Submission:
-    def __init__(self, path):
-
-
+    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
+        if path is None and sgml_content is None:
+            raise ValueError("Either path or sgml_content must be provided")
+        if path is not None and sgml_content is not None:
+            raise ValueError("Only one of path or sgml_content must be provided")
+
+        if sgml_content is not None:
+            self.path = None
+            self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+
+            for idx,doc in enumerate(self.metadata['documents']):
+                type = doc.get('type')
+
+                # Keep only specified types
+                if keep_document_types is not None and type not in keep_document_types:
+                    continue
+                filename = doc.get('filename')
+                extension = Path(filename).suffix
+                self.documents = [Document(type=type, content=raw_documents[idx], extension=extension)]
+
+
+        if path is not None:
+            self.path = Path(path)
+            metadata_path = self.path / 'metadata.json'
+            with metadata_path.open('r') as f:
+                self.metadata = json.load(f)
 
-    def _load_metadata(self):
-        metadata_path = self.path / 'metadata.json'
-        with metadata_path.open('r') as f:
-            self.metadata = json.load(f)
 
     def document_type(self, document_type):
         # Convert single document type to list for consistent handling
@@ -19,20 +40,73 @@ class Submission:
         else:
             document_types = document_type
 
-        for doc in self.metadata['documents']:
+        for idx,doc in enumerate(self.metadata['documents']):
             if doc['type'] in document_types:
+
+                # if loaded from path
+                if self.path is not None:
+                    filename = doc.get('filename')
+                    # oh we need handling here for sequences case
+                    if filename is None:
+                        filename = doc['sequence'] + '.txt'
+
+                    document_path = self.path / filename
+                    extension = document_path.suffix
+
+                    with document_path.open('r') as f:
+                        content = f.read()
+
+                    yield Document(type=doc['type'], content=content, extension=extension)
+                # if loaded from sgml_content
+                else:
+                    yield self.documents[idx]
+
+
+    def __iter__(self):
+        for idx,doc in enumerate(self.metadata['documents']):
+            # if loaded from path
+            if self.path is not None:
                 filename = doc.get('filename')
+
+                # oh we need handling here for sequences case
                 if filename is None:
-
+                    filename = doc['sequence'] + '.txt'
 
                 document_path = self.path / filename
-
-
-
-
-
-
-
-
-
-
+                extension = document_path.suffix
+
+                # check if the file exists
+                if document_path.exists():
+                    with document_path.open('r') as f:
+                        content = f.read()
+
+                    yield Document(type=doc['type'], content=content, extension=extension)
+                else:
+                    print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
+
+            # if loaded from sgml_content
+            else:
+                yield self.documents[idx]
+
+    # keep documents by document type
+    def keep(self, document_type):
+        # Convert single document type to list for consistent handling
+        if isinstance(document_type, str):
+            document_types = [document_type]
+        else:
+            document_types = document_type
+
+        if self.path is not None:
+            for doc in self.metadata['documents']:
+                filename = doc.get('filename')
+                type = doc.get('type')
+                if type not in document_types:
+                    # oh we need handling here for sequences case
+                    if filename is None:
+                        filename = doc.sequence + '.txt'
+
+                    document_path = self.path / filename
+                    # delete the file
+                    document_path.unlink()
+        else:
+            print("Warning: keep() method is only available when loading from path.")
{datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 datamule/__init__.py,sha256=l6YlwT5EeRxPlCtO5Jd4I8l266rSRUJyfFe97cRtSCM,991
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
-datamule/document.py,sha256=
+datamule/document.py,sha256=7FBmjWJJfdKrbQ4UH4J8It7W5GEWTFFEUfQdODUrYlQ,10160
 datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
 datamule/index.py,sha256=0txvbzPcvY1GsdxA-wGdLzAByxSeE_1VyyBp9mZEQRM,2292
-datamule/portfolio.py,sha256=
-datamule/sheet.py,sha256=
-datamule/submission.py,sha256=
+datamule/portfolio.py,sha256=ECevaiF8P6v4mJ7W9IM4hRKNF0GGdQzc1SzBWLnG2qQ,7082
+datamule/sheet.py,sha256=FF0JL8BuAZ7Sd_LY_-sCGJuYlhm3sKgj2jlHUGMjeUQ,1406
+datamule/submission.py,sha256=zWCnucjmfTYcr1Hm9Us-TjGLjWAHuRPtIyaVpLNvs4c,4427
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
@@ -16,11 +16,11 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/sec/submissions/downloader.py,sha256=
-datamule/sec/submissions/eftsquery.py,sha256=
-datamule/sec/submissions/monitor.py,sha256=
-datamule/sec/submissions/streamer.py,sha256=
-datamule/sec/submissions/textsearch.py,sha256
+datamule/sec/submissions/downloader.py,sha256=IB08W8-lQD5Bb0LgzrTN4Xi4HsCw24DybRLHqE1AUrU,3290
+datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
+datamule/sec/submissions/monitor.py,sha256=F24I9yn1k8ggbCJQ-Vk7go_qJHlpkBzVKFYKDs_CWLs,5287
+datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
+datamule/sec/submissions/textsearch.py,sha256=-a5yIrrxxtaK10IJeywFmXuJmSndYL9VKm4SC4I9JAs,5808
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
 datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
@@ -29,7 +29,7 @@ datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTq
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/downloader.py,sha256=Zb1TxsIz887tO3MJVP66siYVtNus89ti-g9oZ6VywrM,11500
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.1.
-datamule-1.1.
-datamule-1.1.
-datamule-1.1.
+datamule-1.1.7.dist-info/METADATA,sha256=gIryya087eiyvgFA5S5vf2s_wKDxaV3ZEAJA7-W4kS8,512
+datamule-1.1.7.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+datamule-1.1.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.1.7.dist-info/RECORD,,
{datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/WHEEL
File without changes
{datamule-1.1.5.dist-info → datamule-1.1.7.dist-info}/top_level.txt
File without changes