datamule 1.1.8__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +31 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/bq.py +349 -12
- datamule/seclibrary/downloader.py +50 -9
- datamule/sheet.py +458 -34
- datamule/submission.py +102 -7
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/METADATA +1 -1
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/RECORD +16 -12
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/WHEEL +1 -1
- datamule/document.py +0 -472
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/top_level.txt +0 -0
datamule/sheet.py
CHANGED
```diff
@@ -3,7 +3,7 @@ import csv
 import os
 from .helper import _process_cik_and_metadata_filters, load_package_dataset
 from .sec.xbrl.downloadcompanyfacts import download_company_facts
-from .seclibrary.bq import get_information_table
+from .seclibrary.bq import get_information_table, get_345, get_proxy_voting_record
 
 class Sheet:
     def __init__(self, path):
@@ -31,9 +31,6 @@ class Sheet:
 
     def get_information_table(
         self,
-        # Required parameters
-        table_type="INFORMATION_TABLE",
-
         # Optional filtering parameters
         columns=None,
         name_of_issuer=None,
@@ -65,8 +62,6 @@ class Sheet:
 
        Parameters:
        -----------
-       table_type : str
-           The table to query (default is "INFORMATION_TABLE")
        columns : List[str], optional
            Specific columns to return. If None, all columns are returned.
 
@@ -97,7 +92,6 @@ class Sheet:
        """
 
        return get_information_table(
-           table_type=table_type,
            columns=columns,
            name_of_issuer=name_of_issuer,
            title_of_class=title_of_class,
@@ -124,12 +118,164 @@ class Sheet:
            verbose=verbose
        )
 
+    def get_345(
+        self,
+        # Optional filtering parameters
+        columns=None,
+        is_derivative=None,
+        is_non_derivative=None,
+        security_title=None,
+        transaction_date=None,
+        document_type=None,
+        transaction_code=None,
+        equity_swap_involved=None,
+        transaction_timeliness=None,
+        transaction_shares=None,
+        transaction_price_per_share=None,
+        shares_owned_following_transaction=None,
+        ownership_type=None,
+        deemed_execution_date=None,
+        conversion_or_exercise_price=None,
+        exercise_date=None,
+        expiration_date=None,
+        underlying_security_title=None,
+        underlying_security_shares=None,
+        underlying_security_value=None,
+        accession=None,
+        reporting_owner_cik=None,
+        issuer_cik=None,
+        filing_date=None,
+
+        # API key handling
+        api_key=None,
+
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for Form 345 insider transaction data.
+
+        Parameters:
+        -----------
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+
+        # Filter parameters
+        is_derivative, security_title, etc. : Various filters that can be:
+            - str/bool: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
+
+        reporting_owner_cik : str or List[str]
+            CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
+            Any match within the array will return the record.
+
+        issuer_cik : str or List[str]
+            CIK(s) of the company/companies
+
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
+
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+
+        return get_345(
+            columns=columns,
+            is_derivative=is_derivative,
+            is_non_derivative=is_non_derivative,
+            security_title=security_title,
+            transaction_date=transaction_date,
+            document_type=document_type,
+            transaction_code=transaction_code,
+            equity_swap_involved=equity_swap_involved,
+            transaction_timeliness=transaction_timeliness,
+            transaction_shares=transaction_shares,
+            transaction_price_per_share=transaction_price_per_share,
+            shares_owned_following_transaction=shares_owned_following_transaction,
+            ownership_type=ownership_type,
+            deemed_execution_date=deemed_execution_date,
+            conversion_or_exercise_price=conversion_or_exercise_price,
+            exercise_date=exercise_date,
+            expiration_date=expiration_date,
+            underlying_security_title=underlying_security_title,
+            underlying_security_shares=underlying_security_shares,
+            underlying_security_value=underlying_security_value,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            issuer_cik=issuer_cik,
+            filing_date=filing_date,
+
+            # API key handling
+            api_key=api_key,
+
+            # Additional options
+            print_cost=print_cost,
+            verbose=verbose
+        )
+
+    def _download_to_csv(self, data, filepath, verbose=False):
+        """
+        Helper method to download data to a CSV file.
+
+        Parameters:
+        -----------
+        data : List[Dict]
+            The data to save
+        filepath : str or Path
+            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
+        verbose : bool
+            Whether to print additional information
+
+        Returns:
+        --------
+        List[Dict]
+            The input data (for method chaining)
+        """
+        # If no data returned, nothing to save
+        if not data:
+            if verbose:
+                print("No data returned from API. No file was created.")
+            return data
+
+        # Resolve filepath - if it's not absolute, make it relative to self.path
+        filepath_obj = Path(filepath)
+        if not filepath_obj.is_absolute():
+            filepath_obj = self.path / filepath_obj
+
+        # Create directory if it doesn't exist
+        os.makedirs(filepath_obj.parent, exist_ok=True)
+
+        # Get fieldnames from the first record
+        fieldnames = data[0].keys()
+
+        # Write to CSV
+        with open(filepath_obj, 'w', newline='') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(data)
+
+        if verbose:
+            print(f"Saved {len(data)} records to {filepath_obj}")
+
+
     def download_information_table(
         self,
         filepath,
-        # Required parameters
-        table_type="INFORMATION_TABLE",
-
         # Optional filtering parameters
         columns=None,
         name_of_issuer=None,
@@ -164,8 +310,6 @@ class Sheet:
        filepath : str
            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
 
-       table_type : str
-           The table to query (default is "INFORMATION_TABLE")
        columns : List[str], optional
            Specific columns to return. If None, all columns are returned.
 
@@ -196,7 +340,6 @@ class Sheet:
        """
        # Get the data from the API
        data = self.get_information_table(
-           table_type=table_type,
            columns=columns,
            name_of_issuer=name_of_issuer,
            title_of_class=title_of_class,
@@ -219,30 +362,311 @@ class Sheet:
            verbose=verbose
        )
 
-        #
-
-
-
-
+        # Save to CSV using the helper method
+        return self._download_to_csv(data, filepath, verbose)
+
+    def download_345(
+        self,
+        filepath,
+        # Optional filtering parameters
+        columns=None,
+        is_derivative=None,
+        is_non_derivative=None,
+        security_title=None,
+        transaction_date=None,
+        document_type=None,
+        transaction_code=None,
+        equity_swap_involved=None,
+        transaction_timeliness=None,
+        transaction_shares=None,
+        transaction_price_per_share=None,
+        shares_owned_following_transaction=None,
+        ownership_type=None,
+        deemed_execution_date=None,
+        conversion_or_exercise_price=None,
+        exercise_date=None,
+        expiration_date=None,
+        underlying_security_title=None,
+        underlying_security_shares=None,
+        underlying_security_value=None,
+        accession=None,
+        reporting_owner_cik=None,
+        issuer_cik=None,
+        filing_date=None,
 
-        #
-
-        if not filepath_obj.is_absolute():
-            filepath_obj = self.path / filepath_obj
+        # API key handling
+        api_key=None,
 
-        #
-
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for Form 345 insider transaction data and save to CSV.
 
-
-
+        Parameters:
+        -----------
+        filepath : str
+            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
 
-
-
-
-
-
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+
+        # Filter parameters
+        is_derivative, security_title, etc. : Various filters that can be:
+            - str/bool: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
 
-
-
+        reporting_owner_cik : str or List[str]
+            CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
+            Any match within the array will return the record.
+
+        issuer_cik : str or List[str]
+            CIK(s) of the company/companies
+
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
 
-
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+        # Get the data from the API
+        data = self.get_345(
+            columns=columns,
+            is_derivative=is_derivative,
+            is_non_derivative=is_non_derivative,
+            security_title=security_title,
+            transaction_date=transaction_date,
+            document_type=document_type,
+            transaction_code=transaction_code,
+            equity_swap_involved=equity_swap_involved,
+            transaction_timeliness=transaction_timeliness,
+            transaction_shares=transaction_shares,
+            transaction_price_per_share=transaction_price_per_share,
+            shares_owned_following_transaction=shares_owned_following_transaction,
+            ownership_type=ownership_type,
+            deemed_execution_date=deemed_execution_date,
+            conversion_or_exercise_price=conversion_or_exercise_price,
+            exercise_date=exercise_date,
+            expiration_date=expiration_date,
+            underlying_security_title=underlying_security_title,
+            underlying_security_shares=underlying_security_shares,
+            underlying_security_value=underlying_security_value,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            issuer_cik=issuer_cik,
+            filing_date=filing_date,
+            api_key=api_key,
+            print_cost=print_cost,
+            verbose=verbose
+        )
+
+        # Save to CSV using the helper method
+        return self._download_to_csv(data, filepath, verbose)
+
+    def get_proxy_voting_record(
+        self,
+        # Optional filtering parameters
+        columns=None,
+        meeting_date=None,
+        isin=None,
+        cusip=None,
+        issuer_name=None,
+        vote_description=None,
+        shares_on_loan=None,
+        shares_voted=None,
+        vote_category=None,
+        vote_record=None,
+        vote_source=None,
+        how_voted=None,
+        figi=None,
+        management_recommendation=None,
+        accession=None,
+        reporting_owner_cik=None,
+        filing_date=None,
+
+        # API key handling
+        api_key=None,
+
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for NPX proxy voting record data.
+
+        Parameters:
+        -----------
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+
+        # Filter parameters
+        meeting_date, isin, cusip, etc. : Various filters that can be:
+            - str: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
+
+        shares_on_loan, shares_voted : int/float or tuple
+            Numeric values or (min, max) range
+
+        filing_date : str or tuple
+            Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
+
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
+
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+
+        return get_proxy_voting_record(
+            columns=columns,
+            meeting_date=meeting_date,
+            isin=isin,
+            cusip=cusip,
+            issuer_name=issuer_name,
+            vote_description=vote_description,
+            shares_on_loan=shares_on_loan,
+            shares_voted=shares_voted,
+            vote_category=vote_category,
+            vote_record=vote_record,
+            vote_source=vote_source,
+            how_voted=how_voted,
+            figi=figi,
+            management_recommendation=management_recommendation,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            filing_date=filing_date,
+
+            # API key handling
+            api_key=api_key,
+
+            # Additional options
+            print_cost=print_cost,
+            verbose=verbose
+        )
+
+    def download_proxy_voting_record(
+        self,
+        filepath,
+        # Optional filtering parameters
+        columns=None,
+        meeting_date=None,
+        isin=None,
+        cusip=None,
+        issuer_name=None,
+        vote_description=None,
+        shares_on_loan=None,
+        shares_voted=None,
+        vote_category=None,
+        vote_record=None,
+        vote_source=None,
+        how_voted=None,
+        figi=None,
+        management_recommendation=None,
+        accession=None,
+        reporting_owner_cik=None,
+        filing_date=None,
+
+        # API key handling
+        api_key=None,
+
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for NPX proxy voting record data and save to CSV.
+
+        Parameters:
+        -----------
+        filepath : str
+            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
+
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+
+        # Filter parameters
+        meeting_date, isin, cusip, etc. : Various filters that can be:
+            - str: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
+
+        shares_on_loan, shares_voted : int/float or tuple
+            Numeric values or (min, max) range
+
+        filing_date : str or tuple
+            Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
+
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
+
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+        # Get the data from the API
+        data = self.get_proxy_voting_record(
+            columns=columns,
+            meeting_date=meeting_date,
+            isin=isin,
+            cusip=cusip,
+            issuer_name=issuer_name,
+            vote_description=vote_description,
+            shares_on_loan=shares_on_loan,
+            shares_voted=shares_voted,
+            vote_category=vote_category,
+            vote_record=vote_record,
+            vote_source=vote_source,
+            how_voted=how_voted,
+            figi=figi,
+            management_recommendation=management_recommendation,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            filing_date=filing_date,
+            api_key=api_key,
+            print_cost=print_cost,
+            verbose=verbose
+        )
+
+        # Save to CSV using the helper method
+        return self._download_to_csv(data, filepath, verbose)
```
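In short, sheet.py drops the `table_type` parameter from `get_information_table`/`download_information_table` (a breaking change for callers that passed it explicitly), adds `get_345`/`download_345` and `get_proxy_voting_record`/`download_proxy_voting_record`, and factors the shared CSV-writing logic into the new `_download_to_csv` helper. A minimal usage sketch of the new surface; the top-level import, CIK, CUSIP, and date values below are illustrative placeholders, and `DATAMULE_API_KEY` is assumed to be set in the environment:

```python
# Sketch only: `from datamule import Sheet` assumes Sheet is re-exported at
# the package top level; `from datamule.sheet import Sheet` also matches the
# file layout above. All filter values are placeholders.
from datamule import Sheet

sheet = Sheet("data")  # base path; relative CSV filepaths resolve against it

# Form 3/4/5 insider transactions. Per the docstrings above, each filter takes
# an exact value, a list of values, or a (min, max) tuple for numeric/date fields.
rows = sheet.get_345(
    issuer_cik="320193",                       # placeholder CIK
    filing_date=("2024-01-01", "2024-12-31"),  # (start, end) range
)

# Same query routed through the new _download_to_csv helper.
sheet.download_345("insider_345.csv", issuer_cik="320193")

# N-PX proxy voting records follow the same pattern.
votes = sheet.get_proxy_voting_record(
    cusip="037833100",                         # placeholder CUSIP
    meeting_date=("2024-01-01", "2024-06-30"),
)
sheet.download_proxy_voting_record("votes.csv", cusip="037833100")
```

Callers upgrading from 1.1.8 that passed `table_type=` to `get_information_table` or `download_information_table` will need to drop that argument in 1.2.1.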
datamule/submission.py
CHANGED
```diff
@@ -1,8 +1,10 @@
 from pathlib import Path
 import json
-from .document import Document
+from .document.document import Document
 from secsgml import parse_sgml_submission_into_memory
-
+import os
+import aiofiles
+
 
 class Submission:
     def __init__(self, path=None,sgml_content=None,keep_document_types=None):
@@ -14,7 +16,12 @@ class Submission:
         if sgml_content is not None:
             self.path = None
             self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+
+            self.accession = self.metadata['accession-number']
+            self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
+
             self.documents = []
+            filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata['documents']):
                 type = doc.get('type')
@@ -24,15 +31,19 @@ class Submission:
                     continue
                 filename = doc.get('filename')
                 extension = Path(filename).suffix
-                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension))
+                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
 
+                filtered_metadata_documents.append(doc)
+
+            self.metadata['documents'] = filtered_metadata_documents
 
         if path is not None:
             self.path = Path(path)
             metadata_path = self.path / 'metadata.json'
             with metadata_path.open('r') as f:
                 self.metadata = json.load(f)
-
+
+
 
     def document_type(self, document_type):
         # Convert single document type to list for consistent handling
@@ -57,7 +68,7 @@ class Submission:
             with document_path.open('r') as f:
                 content = f.read()
 
-            yield Document(type=doc['type'], content=content, extension=extension)
+            yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
         # if loaded from sgml_content
         else:
             yield self.documents[idx]
@@ -81,7 +92,7 @@ class Submission:
                 with document_path.open('r') as f:
                     content = f.read()
 
-                yield Document(type=doc['type'], content=content, extension=extension)
+                yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
             else:
                 print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
 
@@ -110,4 +121,88 @@ class Submission:
             # delete the file
             document_path.unlink()
         else:
-            print("Warning: keep() method is only available when loading from path.")
+            print("Warning: keep() method is only available when loading from path.")
+
+
+
+    def save(self, output_dir="filings"):
+        file_dir = Path(output_dir) / str(self.accession)
+        file_dir.mkdir(parents=True, exist_ok=True)
+
+        metadata_path = file_dir / "metadata.json"
+        with open(metadata_path, 'w') as f:
+            json.dump(self.metadata, f, indent=4)
+
+        for idx, doc in enumerate(self.metadata['documents']):
+            try:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = f"{doc.get('sequence', idx)}.txt"
+            except (KeyError, IndexError):
+                filename = f"{idx}.txt"
+
+            doc_path = file_dir / filename
+
+            if self.path is not None:
+                if hasattr(self, 'documents') and self.documents:
+                    content = self.documents[idx].content
+                else:
+                    orig_doc_path = self.path / filename
+                    if orig_doc_path.exists():
+                        with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
+                            content = f.read()
+                    else:
+                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
+                        continue
+            else:
+                content = self.documents[idx].content
+
+            if isinstance(content, bytes):
+                with open(doc_path, 'wb') as f:
+                    f.write(content)
+            else:
+                with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
+                    f.write(content)
+
+        return file_dir
+
+    async def save_async(self, output_dir="filings"):
+        file_dir = Path(output_dir) / str(self.accession)
+        os.makedirs(file_dir, exist_ok=True)
+
+        metadata_path = file_dir / "metadata.json"
+        async with aiofiles.open(metadata_path, 'w') as f:
+            await f.write(json.dumps(self.metadata, indent=4))
+
+        for idx, doc in enumerate(self.metadata['documents']):
+            try:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = f"{doc.get('sequence', idx)}.txt"
+            except (KeyError, IndexError):
+                filename = f"{idx}.txt"
+
+            doc_path = file_dir / filename
+
+            if self.path is not None:
+                if hasattr(self, 'documents') and self.documents:
+                    content = self.documents[idx].content
+                else:
+                    orig_doc_path = self.path / filename
+                    if orig_doc_path.exists():
+                        async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
+                            content = await f.read()
+                    else:
+                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
+                        continue
+            else:
+                content = self.documents[idx].content
+
+            if isinstance(content, bytes):
+                async with aiofiles.open(doc_path, 'wb') as f:
+                    await f.write(content)
+            else:
+                async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
+                    await f.write(content)
+
+        return file_dir
```
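submission.py now parses `accession` and `filing-date` out of the SGML header, threads them into every `Document` it constructs, trims `metadata['documents']` down to the kept documents, and adds `save()`/`save_async()` for writing a submission back to disk (the async variant does its I/O through aiofiles, the newly imported dependency). A minimal round-trip sketch, assuming `sgml_text` holds a raw EDGAR SGML submission string fetched beforehand:

```python
# Sketch only: `sgml_text` is assumed to be a raw SGML submission string
# (e.g. downloaded from EDGAR); the output directories are placeholders.
import asyncio
from datamule.submission import Submission

sub = Submission(sgml_content=sgml_text)
print(sub.accession, sub.filing_date)  # parsed from the SGML header in 1.2.1

# Synchronous write: filings/<accession>/metadata.json plus each kept document.
out_dir = sub.save(output_dir="filings")
print(f"saved to {out_dir}")

# Async variant mirrors save() but reads and writes through aiofiles.
asyncio.run(sub.save_async(output_dir="filings_async"))
```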