datamule 1.0.3__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- datamule/__init__.py +2 -13
- datamule/document.py +8 -9
- datamule/helper.py +85 -105
- datamule/portfolio.py +105 -29
- datamule/submission.py +0 -38
- {datamule-1.0.3.dist-info → datamule-1.0.7.dist-info}/METADATA +2 -8
- datamule-1.0.7.dist-info/RECORD +10 -0
- datamule/book/__init__.py +0 -0
- datamule/book/book.py +0 -34
- datamule/book/eftsquery.py +0 -127
- datamule/book/xbrl_retriever.py +0 -88
- datamule/data/company_former_names.csv +0 -8148
- datamule/data/company_metadata.csv +0 -10049
- datamule/data/company_tickers.csv +0 -9999
- datamule/data/sec-glossary.csv +0 -728
- datamule/data/xbrl_descriptions.csv +0 -10024
- datamule/downloader/downloader.py +0 -374
- datamule/downloader/premiumdownloader.py +0 -335
- datamule/mapping_dicts/txt_mapping_dicts.py +0 -234
- datamule/mapping_dicts/xml_mapping_dicts.py +0 -19
- datamule/monitor.py +0 -283
- datamule/mulebot/__init__.py +0 -1
- datamule/mulebot/helper.py +0 -35
- datamule/mulebot/mulebot.py +0 -130
- datamule/mulebot/mulebot_server/__init__.py +0 -1
- datamule/mulebot/mulebot_server/server.py +0 -87
- datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -174
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -68
- datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -92
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -56
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -15
- datamule/mulebot/mulebot_server/static/scripts/main.js +0 -57
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -27
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -47
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -129
- datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -28
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -91
- datamule/mulebot/search.py +0 -52
- datamule/mulebot/tools.py +0 -82
- datamule/packageupdater.py +0 -207
- datamule-1.0.3.dist-info/RECORD +0 -43
- {datamule-1.0.3.dist-info → datamule-1.0.7.dist-info}/WHEEL +0 -0
- {datamule-1.0.3.dist-info → datamule-1.0.7.dist-info}/top_level.txt +0 -0
datamule/book/eftsquery.py
DELETED
@@ -1,127 +0,0 @@
-import asyncio
-import aiohttp
-from tqdm import tqdm
-from datetime import datetime
-from urllib.parse import urlencode
-import time
-
-class PreciseRateLimiter:
-    def __init__(self, rate=10, interval=1.0):
-        self.rate = rate  # requests per interval
-        self.interval = interval  # in seconds
-        self.token_time = self.interval / self.rate  # time per token
-        self.last_time = time.time()
-        self.lock = asyncio.Lock()
-
-    async def acquire(self):
-        async with self.lock:
-            now = time.time()
-            wait_time = self.last_time + self.token_time - now
-            if wait_time > 0:
-                await asyncio.sleep(wait_time)
-            self.last_time = time.time()
-            return True
-
-class EFTSQuery:
-    def __init__(self):
-        self.headers = {
-            'User-Agent': 'Your Name yourname@email.com',
-            'Accept-Encoding': 'gzip, deflate',
-            'Host': 'efts.sec.gov'
-        }
-        self.session = None
-        self.limiter = PreciseRateLimiter(10)
-
-    async def __aenter__(self):
-        if not self.session:
-            self.session = aiohttp.ClientSession(headers=self.headers)
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        if self.session:
-            await self.session.close()
-            self.session = None
-
-    async def _fetch_json(self, url):
-        await self.limiter.acquire()
-        try:
-            async with self.session.get(url) as response:
-                if response.status == 429:
-                    await asyncio.sleep(61)
-                    return await self._fetch_json(url)
-                return await response.json()
-        except Exception as e:
-            print(f"Error fetching {url}: {str(e)}")
-            return None
-
-    async def _get_accession_numbers(self, base_url):
-        data = await self._fetch_json(f"{base_url}&from=0&size=1")
-        if not data or 'hits' not in data:
-            return []
-
-        total_hits = data['hits']['total']['value']
-        if not total_hits:
-            return []
-
-        accession_numbers = []
-        start = 0
-        page_size = 100
-        batch_size = 10  # Number of concurrent requests
-
-        with tqdm(total=total_hits) as pbar:
-            while start < total_hits:
-                tasks = []
-                for i in range(batch_size):
-                    if start + i * page_size >= total_hits:
-                        break
-                    url = f"{base_url}&from={start + i * page_size}&size={page_size}"
-                    tasks.append(self._fetch_json(url))
-
-                if not tasks:
-                    break
-
-                results = await asyncio.gather(*tasks)
-
-                for data in results:
-                    if data and 'hits' in data:
-                        hits = data['hits']['hits']
-                        batch_numbers = [
-                            f"{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}"
-                            for hit in hits
-                        ]
-                        accession_numbers.extend(batch_numbers)
-                        pbar.update(len(hits))
-
-                start += batch_size * page_size
-
-        return accession_numbers
-
-    def query_efts(self, cik=None, ticker=None, submission_type=None, filing_date=None, search_text=None):
-        async def _download():
-            async with self as downloader:
-                params = {}
-
-                if cik:
-                    params['ciks'] = str(cik).zfill(10)
-
-                if submission_type:
-                    params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type
-
-                if isinstance(filing_date, list):
-                    dates = [(d, d) for d in filing_date]
-                elif isinstance(filing_date, tuple):
-                    dates = [filing_date]
-                else:
-                    date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
-                    start, end = date_str.split(',')
-                    dates = [(start, end)]
-
-                params['startdt'], params['enddt'] = dates[0]
-
-                if search_text:
-                    params['q'] = f'"{search_text}"'
-
-                base_url = f"https://efts.sec.gov/LATEST/search-index?{urlencode(params, doseq=True)}"
-                return await self._get_accession_numbers(base_url)
-
-        return asyncio.run(_download())
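For reference, the removed EFTSQuery class exposed a single synchronous entry point, query_efts, which wrapped the async pipeline above. A minimal usage sketch follows; the CIK, form types, date range, and search text are illustrative, not taken from the package's documentation:

    # Hypothetical usage of the removed EFTSQuery; all argument values are illustrative.
    from datamule.book.eftsquery import EFTSQuery

    query = EFTSQuery()
    # Returns a list of "{cik}/{accession_number}" identifiers for matching filings.
    ids = query.query_efts(
        cik=320193,                                # zero-padded to 10 digits internally
        submission_type=["10-K", "10-Q"],          # a form string or a list of forms
        filing_date=("2020-01-01", "2023-12-31"),  # (start, end) date tuple
        search_text="climate risk",                # quoted and sent as the q parameter
    )

Two quirks visible in the deleted source: the ticker parameter was accepted but never applied to the request, and when multiple date ranges were built only the first was passed through (params['startdt'], params['enddt'] = dates[0]).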
datamule/book/xbrl_retriever.py
DELETED
@@ -1,88 +0,0 @@
-import asyncio
-import aiohttp
-import time
-
-class PreciseRateLimiter:
-    def __init__(self, rate=10, interval=1.0):
-        self.rate = rate
-        self.interval = interval
-        self.token_time = self.interval / self.rate
-        self.last_time = time.time()
-        self.lock = asyncio.Lock()
-
-    async def acquire(self):
-        async with self.lock:
-            now = time.time()
-            wait_time = self.last_time + self.token_time - now
-            if wait_time > 0:
-                await asyncio.sleep(wait_time)
-            self.last_time = time.time()
-            return True
-
-class XBRLRetriever:
-    def __init__(self):
-        self.base_url = "https://data.sec.gov/api/xbrl/frames"
-        self.headers = {
-            'User-Agent': 'Your Name yourname@email.com',
-            'Accept-Encoding': 'gzip, deflate',
-            'Host': 'data.sec.gov'
-        }
-        self.session = None
-        self.limiter = PreciseRateLimiter(10)
-
-    async def __aenter__(self):
-        if not self.session:
-            self.session = aiohttp.ClientSession(headers=self.headers)
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        if self.session:
-            await self.session.close()
-            self.session = None
-
-    async def _fetch_json(self, url):
-        await self.limiter.acquire()
-        try:
-            async with self.session.get(url) as response:
-                if response.status == 429:
-                    await asyncio.sleep(61)
-                    return await self._fetch_json(url)
-                elif response.status == 200:
-                    return await response.json()
-                else:
-                    print(f"Error {response.status} for URL: {url}")
-                    return None
-        except Exception as e:
-            print(f"Error fetching {url}: {str(e)}")
-            return None
-
-    def _build_url(self, params):
-        taxonomy = params.get('taxonomy')
-        concept = params.get('concept')
-        unit = params.get('unit')
-        period = params.get('period')
-
-        if not all([taxonomy, concept, unit, period]):
-            raise ValueError("Missing required parameters")
-
-        return f"{self.base_url}/{taxonomy}/{concept}/{unit}/{period}.json"
-
-    async def _get_xbrl_data(self, params_list):
-        tasks = []
-        urls = {}
-
-        for params in params_list:
-            url = self._build_url(params)
-            urls[url] = params
-            tasks.append(self._fetch_json(url))
-
-        results = await asyncio.gather(*tasks)
-
-        return {url: result for url, result in zip(urls.keys(), results) if result is not None}
-
-    def get_xbrl_frames(self, params_list):
-        async def _download():
-            async with self as downloader:
-                return await self._get_xbrl_data(params_list)
-
-        return asyncio.run(_download())
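For reference, the removed XBRLRetriever wrapped the SEC XBRL frames API (data.sec.gov/api/xbrl/frames) behind a synchronous get_xbrl_frames call that fanned requests out concurrently. A minimal usage sketch; the taxonomy/concept/unit/period values are illustrative, and the 'data' key reflects the frames API's standard JSON shape rather than anything this class guarantees:

    # Hypothetical usage of the removed XBRLRetriever; parameter values are illustrative.
    from datamule.book.xbrl_retriever import XBRLRetriever

    retriever = XBRLRetriever()
    frames = retriever.get_xbrl_frames([
        {'taxonomy': 'us-gaap', 'concept': 'Revenues', 'unit': 'USD', 'period': 'CY2023'},
        {'taxonomy': 'us-gaap', 'concept': 'Assets', 'unit': 'USD', 'period': 'CY2023Q4I'},
    ])
    # Results are keyed by request URL; requests that failed are silently dropped.
    for url, payload in frames.items():
        print(url, len(payload.get('data', [])))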