PyPI - datamule - Versions diffs - 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl - Mend

datamule 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

datamule/__init__.py +2 -13
datamule/document.py +0 -1
datamule/helper.py +85 -105
datamule/portfolio.py +105 -29
datamule/submission.py +0 -38
{datamule-1.0.3.dist-info → datamule-1.0.6.dist-info}/METADATA +2 -8
datamule-1.0.6.dist-info/RECORD +10 -0
datamule/book/__init__.py +0 -0
datamule/book/book.py +0 -34
datamule/book/eftsquery.py +0 -127
datamule/book/xbrl_retriever.py +0 -88
datamule/data/company_former_names.csv +0 -8148
datamule/data/company_metadata.csv +0 -10049
datamule/data/company_tickers.csv +0 -9999
datamule/data/sec-glossary.csv +0 -728
datamule/data/xbrl_descriptions.csv +0 -10024
datamule/downloader/downloader.py +0 -374
datamule/downloader/premiumdownloader.py +0 -335
datamule/mapping_dicts/txt_mapping_dicts.py +0 -234
datamule/mapping_dicts/xml_mapping_dicts.py +0 -19
datamule/monitor.py +0 -283
datamule/mulebot/__init__.py +0 -1
datamule/mulebot/helper.py +0 -35
datamule/mulebot/mulebot.py +0 -130
datamule/mulebot/mulebot_server/__init__.py +0 -1
datamule/mulebot/mulebot_server/server.py +0 -87
datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -174
datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -68
datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -92
datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -56
datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -15
datamule/mulebot/mulebot_server/static/scripts/main.js +0 -57
datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -27
datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -47
datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -129
datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -28
datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -91
datamule/mulebot/search.py +0 -52
datamule/mulebot/tools.py +0 -82
datamule/packageupdater.py +0 -207
datamule-1.0.3.dist-info/RECORD +0 -43
{datamule-1.0.3.dist-info → datamule-1.0.6.dist-info}/WHEEL +0 -0
{datamule-1.0.3.dist-info → datamule-1.0.6.dist-info}/top_level.txt +0 -0

datamule/monitor.py DELETED Viewed

@@ -1,283 +0,0 @@
-import asyncio
-import aiohttp
-from datetime import timedelta, datetime
-import pytz
-from collections import deque
-import time
-from .helper import headers, identifier_to_cik
-def _get_current_eastern_date():
-    """Get current date in US Eastern timezone (automatically handles DST) """
-    eastern = pytz.timezone('America/New_York')
-    return datetime.now(eastern)
-def _parse_date(date_str):
-    """Parse YYYY-MM-DD date string to datetime object in Eastern timezone"""
-    try:
-        date = datetime.strptime(date_str, '%Y-%m-%d')
-        eastern = pytz.timezone('America/New_York')
-        return eastern.localize(date)
-    except ValueError as e:
-        raise ValueError(f"Invalid date format. Please use YYYY-MM-DD. Error: {str(e)}")
-class PreciseRateLimiter:
-    def __init__(self, rate, interval=1.0):
-        self.rate = rate  # requests per interval
-        self.interval = interval  # in seconds
-        self.token_time = self.interval / self.rate  # time per token
-        self.last_time = time.time()
-        self.lock = asyncio.Lock()
-    async def acquire(self):
-        async with self.lock:
-            now = time.time()
-            wait_time = self.last_time + self.token_time - now
-            if wait_time > 0:
-                await asyncio.sleep(wait_time)
-            self.last_time = time.time()
-            return True
-    async def __aenter__(self):
-        await self.acquire()
-        return self
-    async def __aexit__(self, exc_type, exc, tb):
-        pass
-class RateMonitor:
-    def __init__(self, window_size=1.0):
-        self.window_size = window_size
-        self.requests = deque()
-        self._lock = asyncio.Lock()
-    async def add_request(self, size_bytes):
-        async with self._lock:
-            now = time.time()
-            self.requests.append((now, size_bytes))
-            while self.requests and self.requests[0][0] < now - self.window_size:
-                self.requests.popleft()
-    def get_current_rates(self):
-        now = time.time()
-        while self.requests and self.requests[0][0] < now - self.window_size:
-            self.requests.popleft()
-        if not self.requests:
-            return 0, 0
-        request_count = len(self.requests)
-        byte_count = sum(size for _, size in self.requests)
-        requests_per_second = request_count / self.window_size
-        mb_per_second = (byte_count / 1024 / 1024) / self.window_size
-        return round(requests_per_second, 1), round(mb_per_second, 2)
-class Monitor:
-    def __init__(self):
-        self.last_total = 0
-        self.last_date = None
-        self.current_monitor_date = None
-        self.submissions = []
-        self.max_hits = 10000
-        self.limiter = PreciseRateLimiter(5)  # 5 requests per second
-        self.rate_monitor = RateMonitor()
-        self.headers = headers
-    async def _fetch_json(self, session, url):
-        """Fetch JSON with rate limiting and monitoring."""
-        async with self.limiter:
-            try:
-                async with session.get(url) as response:
-                    response.raise_for_status()
-                    content = await response.read()
-                    await self.rate_monitor.add_request(len(content))
-                    return await response.json()
-            except Exception as e:
-                print(f"Error fetching {url}: {str(e)}")
-                return None
-    async def _poll(self, base_url, session, poll_interval, quiet):
-        """Poll API until new submissions are found."""
-        while True:
-            current_date = _get_current_eastern_date()
-            # If we're caught up to current date, use it, otherwise use our tracking date
-            if self.current_monitor_date.date() >= current_date.date():
-                self.current_monitor_date = current_date
-            else:
-                # If we're behind current date and haven't finished current date's processing,
-                # continue with current date
-                if self.last_date == self.current_monitor_date.strftime('%Y-%m-%d'):
-                    pass
-                else:
-                    # Move to next day
-                    self.current_monitor_date += timedelta(days=1)
-            date_str = self.current_monitor_date.strftime('%Y-%m-%d')
-            timestamp = int(time.time())
-            if self.last_date != date_str:
-                print(f"Processing date: {date_str}")
-                self.last_total = 0
-                self.submissions = []
-                self.last_date = date_str
-            poll_url = f"{base_url}&startdt={date_str}&enddt={date_str}&v={timestamp}"
-            if not quiet:
-                print(f"Polling {poll_url}")
-            try:
-                data = await self._fetch_json(session, poll_url)
-                if data:
-                    current_total = data['hits']['total']['value']
-                    if current_total > self.last_total:
-                        print(f"Found {current_total - self.last_total} new submissions for {date_str}")
-                        self.last_total = current_total
-                        return current_total, data, poll_url
-                    self.last_total = current_total
-                    # If we have no hits and we're processing a past date,
-                    # we can move to the next day immediately
-                    if current_total == 0 and self.current_monitor_date.date() < current_date.date():
-                        continue
-            except Exception as e:
-                print(f"Error in poll: {str(e)}")
-            await asyncio.sleep(poll_interval / 1000)
-    async def _retrieve_batch(self, session, poll_url, from_positions, quiet):
-        """Retrieve a batch of submissions concurrently."""
-        tasks = [
-            self._fetch_json(
-                session,
-                f"{poll_url}&from={pos}"
-            )
-            for pos in from_positions
-        ]
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-        submissions = []
-        for result in results:
-            if isinstance(result, Exception):
-                print(f"Error in batch: {str(result)}")
-                continue
-            if result and 'hits' in result:
-                submissions.extend(result['hits']['hits'])
-        return submissions
-    async def _retrieve(self, poll_url, initial_data, session, quiet):
-        """Retrieve all submissions using parallel batch processing."""
-        batch_size = 10  # Number of concurrent requests
-        page_size = 100  # Results per request
-        max_position = min(self.max_hits, self.last_total)
-        submissions = []
-        # Process in batches of concurrent requests
-        for batch_start in range(0, max_position, batch_size * page_size):
-            from_positions = [
-                pos for pos in range(
-                    batch_start,
-                    min(batch_start + batch_size * page_size, max_position),
-                    page_size
-                )
-            ]
-            if not quiet:
-                print(f"Retrieving batch from positions: {from_positions}")
-            batch_submissions = await self._retrieve_batch(
-                session, poll_url, from_positions, quiet
-            )
-            if not batch_submissions:
-                break
-            submissions.extend(batch_submissions)
-            # If we got fewer results than expected, we're done
-            if len(batch_submissions) < len(from_positions) * page_size:
-                break
-        return submissions
-    async def _monitor(self, callback, form=None, cik=None, ticker=None, start_date=None, poll_interval=1000, quiet=True):
-        """Main monitoring loop with parallel processing."""
-        if poll_interval < 100:
-            raise ValueError("SEC rate limit is 10 requests per second, set poll_interval to 100ms or higher")
-        # Set up initial monitoring date
-        if start_date:
-            self.current_monitor_date = _parse_date(start_date)
-        else:
-            self.current_monitor_date = _get_current_eastern_date()
-        # Handle form parameter
-        if form is None:
-            form = ['-0']
-        elif isinstance(form, str):
-            form = [form]
-        # Handle CIK/ticker parameter
-        cik_param = None
-        if ticker is not None:
-            cik_param = identifier_to_cik(ticker)
-        elif cik is not None:
-            cik_param = cik if isinstance(cik, list) else [cik]
-        # Construct base URL
-        base_url = 'https://efts.sec.gov/LATEST/search-index?forms=' + ','.join(form)
-        # Add CIK parameter if specified
-        if cik_param:
-            cik_list = ','.join(str(c).zfill(10) for c in cik_param)
-            base_url += f"&ciks={cik_list}"
-        async with aiohttp.ClientSession(headers=self.headers) as session:
-            while True:
-                try:
-                    # Poll until we find new submissions
-                    _, data, poll_url = await self._poll(base_url, session, poll_interval, quiet)
-                    # Retrieve all submissions in parallel
-                    submissions = await self._retrieve(poll_url, data, session, quiet)
-                    # Find new submissions
-                    existing_ids = {sub['_id'] for sub in self.submissions}
-                    new_submissions = [
-                        sub for sub in submissions
-                        if sub['_id'] not in existing_ids
-                    ]
-                    if new_submissions:
-                        self.submissions.extend(new_submissions)
-                        if callback:
-                            await callback(new_submissions)
-                        reqs_per_sec, mb_per_sec = self.rate_monitor.get_current_rates()
-                        if not quiet:
-                            print(f"Current rates: {reqs_per_sec} req/s, {mb_per_sec} MB/s")
-                except Exception as e:
-                    print(f"Error in monitor: {str(e)}")
-                    await asyncio.sleep(poll_interval / 1000)
-                await asyncio.sleep(poll_interval / 1000)
-    def monitor_submissions(self, callback=None, form=None, cik=None, ticker=None, start_date=None, poll_interval=1000, quiet=True):
-        """
-        Start the monitoring process.
-        Parameters:
-            callback (callable, optional): Function to call when new submissions are found
-            form (str or list, optional): Form type(s) to monitor
-            cik (str or list, optional): CIK(s) to monitor
-            ticker (str, optional): Ticker symbol to monitor
-            start_date (str, optional): Start date in YYYY-MM-DD format
-            poll_interval (int, optional): Polling interval in milliseconds
-            quiet (bool, optional): Suppress verbose output
-        """
-        asyncio.run(self._monitor(callback, form, cik, ticker, start_date, poll_interval, quiet))

datamule/mulebot/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-from .mulebot import MuleBot

datamule/mulebot/helper.py DELETED Viewed

@@ -1,35 +0,0 @@
-import requests
-from datamule.global_vars import headers
-from datamule.helper import identifier_to_cik
-from datamule import Parser
-parser = Parser()
-def get_company_concept(ticker):
-    cik = identifier_to_cik(ticker)[0]
-    url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json'
-    response = requests.get(url,headers=headers)
-    data = response.json()
-    table_dict_list = parser.parse_company_concepts(data)
-    # drop tables where label is None
-    table_dict_list = [table_dict for table_dict in table_dict_list if table_dict['label'] is not None]
-    return table_dict_list
-def select_dict_by_title(data, title):
-    if isinstance(data, dict):
-        if data.get('title') == title:
-            return data
-        for value in data.values():
-            result = select_dict_by_title(value, title)
-            if result:
-                return result
-    elif isinstance(data, list):
-        for item in data:
-            result = select_dict_by_title(item, title)
-            if result:
-                return result
-    return None

datamule/mulebot/mulebot.py DELETED Viewed

@@ -1,130 +0,0 @@
-import openai
-import json
-from datamule.helper import identifier_to_cik
-from datamule import Downloader, Parser
-from .search import search_filing
-from .tools import tools, return_title_tool
-from .helper import get_company_concept, select_dict_by_title
-downloader = Downloader()
-parser = Parser()
-class MuleBot:
-    def __init__(self, api_key):
-        self.client = openai.OpenAI(api_key=api_key)
-        self.messages = [
-            {"role": "system", "content": "You are a helpful, but concise, assistant to assist with questions related to the Securities and Exchanges Commission. You are allowed to guess tickers."}
-        ]
-        self.total_tokens = 0
-    def process_message(self, user_input):
-        new_message_chain = self.messages
-        new_message_chain.append({"role": "user", "content": user_input})
-        try:
-            response = self.client.chat.completions.create(
-                model="gpt-4o-mini",
-                messages=new_message_chain,
-                tools=tools,
-                tool_choice="auto"
-            )
-            self.total_tokens += response.usage.total_tokens
-            assistant_message = response.choices[0].message
-            if assistant_message.content is None:
-                assistant_message.content = "I'm processing your request."
-            new_message_chain.append({"role": "assistant", "content": assistant_message.content})
-            tool_calls = assistant_message.tool_calls
-            if tool_calls is None:
-                return {'key':'text','value':assistant_message.content}
-            else:
-                for tool_call in tool_calls:
-                    print(f"Tool call: {tool_call.function.name}")
-                    if tool_call.function.name == "identifier_to_cik":
-                        function_args = json.loads(tool_call.function.arguments)
-                        print(f"Function args: {function_args}")
-                        cik = identifier_to_cik(function_args["ticker"])
-                        return {'key':'text','value':cik}
-                    elif tool_call.function.name == "get_company_concept":
-                        function_args = json.loads(tool_call.function.arguments)
-                        print(f"Function args: {function_args}")
-                        table_dict_list = get_company_concept(function_args["ticker"])
-                        return {'key':'table','value':table_dict_list}
-                    elif tool_call.function.name == "get_filing_urls":
-                        function_args = json.loads(tool_call.function.arguments)
-                        print(f"Function args: {function_args}")
-                        result = downloader.download(**function_args,return_urls=True)
-                        return {'key':'list','value':result}
-                    elif tool_call.function.name == "find_filing_section_by_title":
-                        function_args = json.loads(tool_call.function.arguments)
-                        print(f"Function args: {function_args}")
-                        # Parse the filing
-                        data = parser.parse_filing(function_args["url"])
-                        # find possible matches
-                        section_dicts = search_filing(query = function_args["title"], nested_dict =data, score_cutoff=0.3)
-                        # feed titles back to assistant
-                        titles = [section['title'] for section in section_dicts]
-                        new_message_chain.append({"role": "assistant", "content": f"Which of these titles is closest: {','.join(titles)}"})
-                        title_response = self.client.chat.completions.create(
-                            model="gpt-4o-mini",
-                            messages=new_message_chain,
-                            tools=[return_title_tool],
-                            tool_choice="required"
-                        )
-                        title_tool_call = title_response.choices[0].message.tool_calls[0]
-                        title = json.loads(title_tool_call.function.arguments)['title']
-                        print(f"Selected title: {title}")
-                        #print(f"Possible titles: {titles}")
-                        # select the section
-                        #section_dict = select_dict_by_title(data, title)
-                        # probably want to return full dict, and section label
-                        return {'key':'filing','value':{'data':data,'title':title}}
-            return {'key':'text','value':'No tool call was made.'}
-        except Exception as e:
-            return f"An error occurred: {str(e)}"
-    def get_total_tokens(self):
-        return self.total_tokens
-    def run(self):
-        """Basic chatbot loop"""
-        print("MuleBot: Hello! I'm here to assist you with questions related to the Securities and Exchange Commission. Type 'quit', 'exit', or 'bye' to end the conversation.")
-        while True:
-            user_input = input("You: ")
-            if user_input.lower() in ['quit', 'exit', 'bye']:
-                print("MuleBot: Goodbye!")
-                break
-            response = self.process_message(user_input)
-            response_type = response['key']
-            if response_type == 'text':
-                value = response['value']
-                print(value)
-            elif response_type == 'table':
-                value = response['value']
-                print(value)
-            elif response_type == 'list':
-                value = response['value']
-                print(value)
-            elif response_type == 'filing':
-                value = response['value']
-                print(value)
-            else:
-                value = response['value']
-                print(value)

datamule/mulebot/mulebot_server/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-from .server import MuleBotServer

datamule/mulebot/mulebot_server/server.py DELETED Viewed

@@ -1,87 +0,0 @@
-import os
-from flask import Flask, request, jsonify, render_template
-from datamule.mulebot import MuleBot
-from datamule.filing_viewer import create_interactive_filing, create_valid_id
-class MuleBotServer:
-    def __init__(self, template='chat-minimalist.html'):
-        template_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'templates'))
-        static_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'static'))
-        self.app = Flask(__name__, template_folder=template_dir, static_folder=static_dir)
-        self.mulebot = None
-        self.template = template
-        self.setup_routes()
-    def setup_routes(self):
-        @self.app.route('/')
-        def home():
-            return render_template(self.template)
-        @self.app.route('/chat-with-prompt')
-        def chat_with_prompt():
-            prefilled_prompt = request.args.get('prompt', '')
-            return render_template(self.template, prefilled_prompt=prefilled_prompt)
-        @self.app.route('/chat', methods=['POST'])
-        def chat():
-            user_input = request.json['message']
-            # Process the message using MuleBot's process_message method
-            response = self.mulebot.process_message(user_input)
-            response_type = response['key']
-            # Prepare the response based on the type
-            if response_type == 'text':
-                # If response type is text, add it to the chat
-                chat_response = {
-                    'type': 'text',
-                    'content': response['value']
-                }
-            elif response_type == 'table':
-                # If response type is table, prepare it for the artifact window
-                chat_response = {
-                    'type': 'artifact',
-                    'content': response['value'],
-                    'artifact_type': 'artifact-table'
-                }
-            elif response_type == 'list':
-                chat_response = {
-                    'type': 'artifact',
-                    'content': response['value'],
-                    'artifact_type': 'artifact-list'
-                }
-            elif response_type == 'filing':
-                data = response['value']['data']
-                title = response['value']['title']
-                section_id = create_valid_id(title)
-                # create a filing viewer display
-                html = create_interactive_filing(data)
-                # we'll need to display the filing viewer in the artifact window, with a json export option
-                chat_response = {
-                    'type': 'artifact',
-                    'content': html,
-                    'data': data,
-                    'section_id': section_id,
-                    'artifact_type': 'artifact-filing'
-                }
-            else:
-                # Handle other types of responses if needed
-                chat_response = {
-                    'type': 'unknown',
-                    'content': 'Unsupported response type'
-                }
-            return jsonify({
-                'response': chat_response,
-                'total_tokens': self.mulebot.get_total_tokens()
-            })
-    def set_api_key(self, api_key):
-        self.mulebot = MuleBot(api_key)
-    def run(self, debug=False, host='0.0.0.0', port=5000):
-        if not self.mulebot:
-            raise ValueError("API key not set. Please call set_api_key() before running the server.")
-        self.app.run(debug=debug, host=host, port=port)

datamule 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl

datamule 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl