datamule-0.415-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamule has been flagged as possibly problematic.

Files changed (37)
  1. datamule/__init__.py +62 -0
  2. datamule/data/company_former_names.csv +8148 -0
  3. datamule/data/company_metadata.csv +10049 -0
  4. datamule/data/company_tickers.csv +9999 -0
  5. datamule/data/sec-glossary.csv +728 -0
  6. datamule/data/xbrl_descriptions.csv +10024 -0
  7. datamule/dataset_builder/dataset_builder.py +259 -0
  8. datamule/document.py +130 -0
  9. datamule/helper.py +123 -0
  10. datamule/monitor.py +236 -0
  11. datamule/mulebot/__init__.py +1 -0
  12. datamule/mulebot/helper.py +35 -0
  13. datamule/mulebot/mulebot.py +130 -0
  14. datamule/mulebot/mulebot_server/__init__.py +1 -0
  15. datamule/mulebot/mulebot_server/server.py +87 -0
  16. datamule/mulebot/mulebot_server/static/css/minimalist.css +174 -0
  17. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +68 -0
  18. datamule/mulebot/mulebot_server/static/scripts/chat.js +92 -0
  19. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +56 -0
  20. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +15 -0
  21. datamule/mulebot/mulebot_server/static/scripts/main.js +57 -0
  22. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +27 -0
  23. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +47 -0
  24. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +129 -0
  25. datamule/mulebot/mulebot_server/static/scripts/utils.js +28 -0
  26. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +91 -0
  27. datamule/mulebot/search.py +52 -0
  28. datamule/mulebot/tools.py +82 -0
  29. datamule/packageupdater.py +207 -0
  30. datamule/parser/sgml_parsing/sgml_parser_cy.c +19082 -0
  31. datamule/parser/sgml_parsing/sgml_parser_cy.cpython-39-darwin.so +0 -0
  32. datamule/portfolio.py +16 -0
  33. datamule/submission.py +61 -0
  34. datamule-0.415.dist-info/METADATA +36 -0
  35. datamule-0.415.dist-info/RECORD +37 -0
  36. datamule-0.415.dist-info/WHEEL +5 -0
  37. datamule-0.415.dist-info/top_level.txt +1 -0
datamule/dataset_builder/dataset_builder.py ADDED
@@ -0,0 +1,259 @@
+ import pandas as pd
+ import json
+ import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from tqdm import tqdm
+ import google.generativeai as genai
+ import time
+ import psutil
+ from threading import Lock
+
+ class RateLimiter:
+     def __init__(self, max_rpm):
+         self.min_delay = 62.0 / max_rpm  # use a 62-second window rather than 60 to leave some buffer (WIP)
+         self.last_request = time.time()
+         self.lock = Lock()
+         self.request_count = 0
+
+     def acquire(self):
+         with self.lock:
+             now = time.time()
+             time_since_last = now - self.last_request
+             delay_needed = self.min_delay - time_since_last
+             self.last_request = now + max(0, delay_needed)  # Update based on expected completion
+             self.request_count += 1
+             count = self.request_count
+
+         # Sleep outside the lock
+         if delay_needed > 0:
+             time.sleep(delay_needed)
+
+         return count
+
+ class DatasetBuilder:
+     def __init__(self):
+         self.base_prompt = None
+         self.response_schema = None
+         self.input_path = None
+         self.output_path = None
+         self.failed_path = None
+         self.max_rpm = 1450
+         self.max_workers = 30
+         self.save_frequency = 100
+         self.output_columns = None
+         self.buffer = []
+         self.buffer_lock = Lock()
+         self.failed_ids = set()
+         self.failed_lock = Lock()
+         self.model_name = "gemini-1.5-flash-8b"  # Default model
+         self.model_config = {}  # Additional model configuration
+         self.api_key = None
+
+     def set_api_key(self, api_key):
+         """Set the API key for Google's Generative AI."""
+         self.api_key = api_key
+         genai.configure(api_key=api_key)
+         return self
+
+     def set_paths(self, input_path, output_path, failed_path):
+         """Set input and output file paths."""
+         self.input_path = input_path
+         self.output_path = output_path
+         self.failed_path = failed_path
+         return self
+
+     def set_base_prompt(self, prompt):
+         """Set the base prompt for LLM processing."""
+         self.base_prompt = prompt
+         return self
+
+     def set_response_schema(self, schema):
+         """Set the response schema and derive output columns."""
+         self.response_schema = schema
+         # Derive output columns from schema
+         if schema and 'items' in schema and 'properties' in schema['items']:
+             properties = schema['items']['properties']
+             self.output_columns = ['accession_number'] + list(properties.keys())
+         return self
+
+     def set_rpm(self, max_rpm=1450):
+         """Set the maximum requests per minute."""
+         self.max_rpm = max_rpm
+         return self
+
+     def set_max_workers(self, max_workers=30):
+         """Set the maximum number of concurrent workers."""
+         self.max_workers = max_workers
+         return self
+
+     def set_save_frequency(self, frequency=100):
+         """Set how often to save progress."""
+         self.save_frequency = frequency
+         return self
+
+     def set_model(self, model_name="gemini-1.5-flash-8b", **model_config):
+         """Set the model name and configuration."""
+         self.model_name = model_name
+         self.model_config = model_config
+         return self
+
+     def validate_config(self):
+         """Validate that all required configurations are set."""
+         if not all([self.base_prompt, self.response_schema, self.input_path,
+                     self.output_path, self.failed_path, self.api_key]):
+             raise ValueError("""Missing required configuration. Please ensure you have set:
+             - API key
+             - Paths (input_path, output_path, failed_path)
+             - Base prompt
+             - Response schema""")
+
+     def get_processed_ids(self):
+         """Get set of processed accession numbers from output file."""
+         if not os.path.exists(self.output_path):
+             return set()
+
+         try:
+             # Read only the accession_number column for memory efficiency
+             df = pd.read_csv(self.output_path, usecols=['accession_number'])
+             return set(df['accession_number'])
+         except Exception as e:
+             print(f"Warning: Error reading processed IDs: {e}")
+             return set()
+
+     def save_data(self, df_new):
+         """Append new data to existing CSV."""
+         df_new.to_csv(self.output_path, mode='a', header=not os.path.exists(self.output_path), index=False)
+
+     def save_failed_ids(self):
+         """Save failed accession numbers to file."""
+         with open(self.failed_path, 'w') as f:
+             for acc in self.failed_ids:
+                 f.write(f"{acc}\n")
+
+     def process_text(self, args):
+         """Process a single text entry through the model."""
+         model, text, accession_number, rate_limiter = args
+
+         current_requests = rate_limiter.acquire()
+
+         full_prompt = self.base_prompt + "\n\nINFORMATION:\n" + text
+
+         try:
+             generation_config = genai.GenerationConfig(
+                 response_mime_type="application/json",
+                 response_schema=self.response_schema,
+                 **self.model_config
+             )
+
+             response = model.generate_content(
+                 full_prompt,
+                 generation_config=generation_config
+             )
+             results = json.loads(response.text)
+
+             for result in results:
+                 result['accession_number'] = accession_number
+
+             with self.buffer_lock:
+                 self.buffer.extend(results)
+
+             return True, current_requests
+         except Exception as e:
+             with self.failed_lock:
+                 self.failed_ids.add(accession_number)
+             return False, f"Error processing {accession_number}: {str(e)}"
+
+     def build(self):
+         """Main processing method to build the dataset."""
+         self.validate_config()
+
+         # Initialize model and rate limiter
+         model = genai.GenerativeModel(self.model_name)
+         rate_limiter = RateLimiter(self.max_rpm)
+
+         # Load data
+         print("Loading data...")
+         df_input = pd.read_csv(self.input_path)
+         processed_ids = self.get_processed_ids()
+         df_to_process = df_input[~df_input['accession_number'].isin(processed_ids)]
+
+         total_in_dataset = len(df_input)
+         already_processed = len(processed_ids)
+         to_process = len(df_to_process)
+
+         print(f"Total entries in dataset: {total_in_dataset}")
+         print(f"Already processed: {already_processed}")
+         print(f"New entries to process: {to_process}")
+
+         if len(df_to_process) == 0:
+             print("All entries already processed!")
+             return
+
+         work_items = [
+             (model, row['text'], row['accession_number'], rate_limiter)
+             for _, row in df_to_process.iterrows()
+         ]
+
+         start_time = time.time()
+         last_save_time = time.time()
+         processed_count = 0
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             futures = {executor.submit(self.process_text, item): item for item in work_items}
+
+             with tqdm(total=total_in_dataset, initial=already_processed, desc="Processing entries") as pbar:
+                 for future in as_completed(futures):
+                     success, result = future.result()
+
+                     if not success:
+                         print(f"\n{result}")
+
+                     processed_count += 1
+                     pbar.update(1)
+
+                     elapsed = time.time() - start_time
+                     rpm = processed_count / (elapsed / 60)
+                     memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
+
+                     current_progress = already_processed + processed_count
+                     pbar.set_description(
+                         f"Processed {current_progress}/{total_in_dataset} | {rpm:.0f} RPM | Mem: {memory_usage:.0f}MB"
+                     )
+
+                     # Save periodically using append
+                     if len(self.buffer) >= self.save_frequency:
+                         with self.buffer_lock:
+                             df_new = pd.DataFrame(self.buffer)
+                             self.buffer = []
+
+                         if not df_new.empty:
+                             self.save_data(df_new)
+                             last_save_time = time.time()
+
+                     # Save failed IDs periodically
+                     if self.failed_ids and time.time() - last_save_time > 300:
+                         self.save_failed_ids()
+                         last_save_time = time.time()
+
+         # Save any remaining results
+         if self.buffer:
+             with self.buffer_lock:
+                 df_new = pd.DataFrame(self.buffer)
+                 self.buffer = []
+
+             if not df_new.empty:
+                 self.save_data(df_new)
+
+         if self.failed_ids:
+             self.save_failed_ids()
+
+         # Print final statistics
+         elapsed = time.time() - start_time
+         final_rpm = processed_count / (elapsed / 60)
+
+         print(f"\nProcessing complete:")
+         print(f"Total processed in this run: {processed_count}")
+         print(f"Average speed: {final_rpm:.0f} RPM")
+         print(f"Failed entries: {len(self.failed_ids)}")
+         if self.failed_ids:
+             print(f"Failed entries saved to: {self.failed_path}")
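For orientation, a minimal usage sketch of the DatasetBuilder added above. This is not part of the package: the file paths, prompt, schema, and API key below are made up, and the import path assumes the module location shown in the file list.

    from datamule.dataset_builder.dataset_builder import DatasetBuilder

    # Hypothetical response schema in the shape set_response_schema() expects:
    # a list of objects whose 'properties' become the output CSV columns.
    schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "topic": {"type": "string"},
                "sentiment": {"type": "string"},
            },
            "required": ["topic", "sentiment"],
        },
    }

    builder = (
        DatasetBuilder()
        .set_api_key("YOUR_GEMINI_API_KEY")       # placeholder
        .set_paths(
            input_path="filings.csv",             # expects 'accession_number' and 'text' columns
            output_path="dataset.csv",
            failed_path="failed.txt",
        )
        .set_base_prompt("List the topics discussed and their sentiment.")
        .set_response_schema(schema)
        .set_rpm(1450)
        .set_max_workers(30)
        .set_save_frequency(100)
    )
    builder.build()  # resumable: accession numbers already in dataset.csv are skipped

Each setter returns self, which is what makes the chained configuration above possible.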
datamule/document.py ADDED
@@ -0,0 +1,130 @@
+ import json
+ import csv
+ from .parser.document_parsing.sec_parser import Parser
+ from .helper import convert_to_dashed_accession
+
+ # we need to modify parse filing to take option in memory
+
+ parser = Parser()
+
+ class Document:
+     def __init__(self, type, filename):
+         self.type = type
+         self.filename = filename
+
+         self.data = None
+
+     def parse(self):
+         self.data = parser.parse_filing(self.filename, self.type)
+         return self.data
+
+     def write_json(self, output_filename=None):
+         if not self.data:
+             raise ValueError("No data to write. Parse filing first.")
+
+         if output_filename is None:
+             output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
+
+         with open(output_filename, 'w') as f:
+             json.dump(self.data, f, indent=2)
+
+     def write_csv(self, output_filename=None, accession_number=None):
+         if self.data is None:
+             raise ValueError("No data available. Please call parse_filing() first.")
+
+         if output_filename is None:
+             output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
+
+         with open(output_filename, 'w', newline='') as csvfile:
+             if not self.data:
+                 return output_filename
+
+             has_document = any('document' in item for item in self.data)
+
+             if has_document and 'document' in self.data:
+                 writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 flattened = self._flatten_dict(self.data['document'])
+                 for section, text in flattened.items():
+                     writer.writerow({'section': section, 'text': text})
+             else:
+                 fieldnames = list(self.data[0].keys())
+                 if accession_number:
+                     fieldnames.append('Accession Number')
+                 writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 for row in self.data:
+                     if accession_number:
+                         row['Accession Number'] = convert_to_dashed_accession(accession_number)
+                     writer.writerow(row)
+
+         return output_filename
+
+     def _document_to_section_text(self, document_data, parent_key=''):
+         items = []
+
+         if isinstance(document_data, dict):
+             for key, value in document_data.items():
+                 # Build the section name
+                 section = f"{parent_key}_{key}" if parent_key else key
+
+                 # If the value is a dict, recurse
+                 if isinstance(value, dict):
+                     items.extend(self._document_to_section_text(value, section))
+                 # If it's a list, handle each item
+                 elif isinstance(value, list):
+                     for i, item in enumerate(value):
+                         if isinstance(item, dict):
+                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                         else:
+                             items.append({
+                                 'section': f"{section}_{i+1}",
+                                 'text': str(item)
+                             })
+                 # Base case - add the item
+                 else:
+                     items.append({
+                         'section': section,
+                         'text': str(value)
+                     })
+
+         return items
+
+     def _flatten_dict(self, d, parent_key=''):
+         items = {}
+
+         if isinstance(d, list):
+             return [self._flatten_dict(item) for item in d]
+
+         for k, v in d.items():
+             new_key = f"{parent_key}_{k}" if parent_key else k
+
+             if isinstance(v, dict):
+                 items.update(self._flatten_dict(v, new_key))
+             else:
+                 items[new_key] = str(v)
+
+         return items
+
+     def __iter__(self):
+         if not self.data:
+             self.parse()
+
+         if self.type == 'INFORMATION TABLE':
+             return iter(self.data)
+         elif self.type == '8-K':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == '10-K':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == '10-Q':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type in ['3', '4', '5']:
+             return iter(self._flatten_dict(self.data['holdings']))
+         elif self.type == 'D':
+             return iter(self._flatten_dict(self.data['document']['relatedPersonsList']['relatedPersonInfo']))
+         elif self.type == 'NPORT-P':
+             return iter(self._flatten_dict(self.data['document']['formData']['invstOrSecs']['invstOrSec']))
+         elif self.type == 'SC 13D':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == 'SC 13G':
+             return iter(self._document_to_section_text(self.data['document']))
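A short illustrative sketch of how the Document class above might be used. The file name is hypothetical, and it assumes the filing's primary document has already been downloaded and that the bundled parser handles the 8-K type, as the __iter__ branches suggest.

    from datamule.document import Document

    doc = Document(type='8-K', filename='example_8k.htm')   # hypothetical file
    data = doc.parse()                 # runs the bundled SEC parser and caches the result on doc.data
    doc.write_json('example_8k.json')  # dumps the parsed structure as JSON
    doc.write_csv('example_8k.csv')    # writes section/text rows, or tabular rows, depending on the data

    # Iteration parses lazily if needed; for 8-K filings it yields {'section': ..., 'text': ...} dicts.
    for item in doc:
        print(item['section'])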
datamule/helper.py ADDED
@@ -0,0 +1,123 @@
+ import requests
+ import os
+ from tqdm import tqdm
+ import zipfile
+ from pkg_resources import resource_filename
+ import csv
+ import re
+
+ # Unused in current implementation.
+ def construct_primary_doc_url(cik, accession_number, primary_doc_url):
+     accession_number = accession_number.replace("-", "")
+     return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{primary_doc_url}"
+
+ # DONE
+ def _download_from_dropbox(url, output_path):
+     headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
+     r = requests.get(url, stream=True, headers=headers)
+     total_size = int(r.headers.get('content-length', 0))
+
+     with open(output_path, 'wb') as f, tqdm(
+         desc="Downloading " + os.path.basename(output_path),
+         total=total_size,
+         unit='iB',
+         unit_scale=True,
+         unit_divisor=1024,
+     ) as progress_bar:
+         for chunk in r.iter_content(chunk_size=1024):
+             size = f.write(chunk)
+             progress_bar.update(size)
+
+     # Check if the downloaded file is a zip file
+     if zipfile.is_zipfile(output_path):
+         extract_path = os.path.dirname(output_path)
+         with zipfile.ZipFile(output_path, 'r') as zip_ref:
+             for file_info in zip_ref.infolist():
+                 extract_file_path = os.path.join(extract_path, file_info.filename)
+                 with zip_ref.open(file_info) as file_in_zip, \
+                      open(extract_file_path, 'wb') as output_file, \
+                      tqdm(total=file_info.file_size, unit='B', unit_scale=True,
+                           desc=f"Extracting {file_info.filename}") as pbar:
+                     while True:
+                         chunk = file_in_zip.read(8192)
+                         if not chunk:
+                             break
+                         output_file.write(chunk)
+                         pbar.update(len(chunk))
+
+         # Remove the zip file after extraction
+         os.remove(output_path)
+         print(f"Extracted contents to {extract_path}")
+     else:
+         print(f"Downloaded file is not a zip. Saved to {output_path}")
+
+ # May generalize to load any package resource
+ def load_package_csv(name):
+     """Load package CSV files"""
+     csv_path = resource_filename('datamule', f'data/{name}.csv')
+     company_tickers = []
+
+     with open(csv_path, 'r') as csvfile:
+         csv_reader = csv.DictReader(csvfile)
+         for row in csv_reader:
+             company_tickers.append(row)
+
+     return company_tickers
+
+ def load_package_dataset(dataset):
+     if dataset == 'company_tickers':
+         return load_package_csv('company_tickers')
+     elif dataset == 'company_former_names':
+         return load_package_csv('company_former_names')
+     elif dataset == 'company_metadata':
+         return load_package_csv('company_metadata')
+     elif dataset == 'sec_glossary':
+         return load_package_csv('sec-glossary')
+     elif dataset == 'xbrl_descriptions':
+         return load_package_csv('xbrl_descriptions')
+
+ # DONE
+ def identifier_to_cik(ticker):
+     """Convert company tickers to CIK codes"""
+     company_tickers = load_package_csv('company_tickers')
+     if ticker:
+         if isinstance(ticker, list):
+             cik = []
+             for t in ticker:
+                 cik.extend([company['cik'] for company in company_tickers if t == company['ticker']])
+         else:
+             cik = [company['cik'] for company in company_tickers if ticker == company['ticker']]
+
+     if not cik:
+         raise ValueError("No matching companies found")
+
+     return cik
+
+
+ def fix_filing_url(url):
+     match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
+     if match_suffix:
+         suffix_number = match_suffix.group(1)
+         file_ext = match_suffix.group(2)
+         match_accession = re.search(r'/(\d{18})/', url)
+         if match_accession:
+             accession_number = match_accession.group(1)
+             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
+             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
+             return new_url
+     return url
+
+ def convert_to_dashed_accession(accession):
+     # Remove any existing dashes or whitespace
+     cleaned = ''.join(accession.split())
+
+     # Check if the cleaned string has 18 characters
+     if len(cleaned) != 18:
+         raise ValueError("Invalid accession number format. Expected 18 characters.")
+
+     # Insert dashes at the correct positions
+     dashed = f"{cleaned[:10]}-{cleaned[10:12]}-{cleaned[12:]}"
+
+     return dashed
+
+ headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
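A few illustrative calls to the helpers above. The ticker, accession number, and URL are made up, and identifier_to_cik depends on whatever is in the bundled company_tickers.csv.

    from datamule.helper import identifier_to_cik, convert_to_dashed_accession, fix_filing_url

    print(identifier_to_cik('AAPL'))             # list of CIK strings whose ticker matches, per the bundled CSV
    print(identifier_to_cik(['AAPL', 'MSFT']))   # a list of tickers is accepted too

    print(convert_to_dashed_accession('000032019324000123'))
    # '0000320193-24-000123'

    url = 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0001.txt'
    print(fix_filing_url(url))
    # '.../000032019324000123/0000320193-24-000123-0001.txt'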