datamule 1.0.0__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {datamule-1.0.0 → datamule-1.0.2}/PKG-INFO +1 -1
  2. datamule-1.0.2/datamule/book/__init__.py +0 -0
  3. {datamule-1.0.0/datamule → datamule-1.0.2/datamule/book}/book.py +19 -1
  4. datamule-1.0.2/datamule/book/eftsquery.py +127 -0
  5. datamule-1.0.2/datamule/book/xbrl_retriever.py +88 -0
  6. {datamule-1.0.0 → datamule-1.0.2}/datamule/document.py +3 -2
  7. {datamule-1.0.0 → datamule-1.0.2}/datamule.egg-info/PKG-INFO +1 -1
  8. {datamule-1.0.0 → datamule-1.0.2}/datamule.egg-info/SOURCES.txt +4 -1
  9. {datamule-1.0.0 → datamule-1.0.2}/setup.py +1 -1
  10. {datamule-1.0.0 → datamule-1.0.2}/datamule/__init__.py +0 -0
  11. {datamule-1.0.0 → datamule-1.0.2}/datamule/config.py +0 -0
  12. {datamule-1.0.0 → datamule-1.0.2}/datamule/data/company_former_names.csv +0 -0
  13. {datamule-1.0.0 → datamule-1.0.2}/datamule/data/company_metadata.csv +0 -0
  14. {datamule-1.0.0 → datamule-1.0.2}/datamule/data/company_tickers.csv +0 -0
  15. {datamule-1.0.0 → datamule-1.0.2}/datamule/data/sec-glossary.csv +0 -0
  16. {datamule-1.0.0 → datamule-1.0.2}/datamule/data/xbrl_descriptions.csv +0 -0
  17. {datamule-1.0.0 → datamule-1.0.2}/datamule/downloader/downloader.py +0 -0
  18. {datamule-1.0.0 → datamule-1.0.2}/datamule/downloader/premiumdownloader.py +0 -0
  19. {datamule-1.0.0 → datamule-1.0.2}/datamule/helper.py +0 -0
  20. {datamule-1.0.0 → datamule-1.0.2}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  21. {datamule-1.0.0 → datamule-1.0.2}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  22. {datamule-1.0.0 → datamule-1.0.2}/datamule/monitor.py +0 -0
  23. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/__init__.py +0 -0
  24. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/helper.py +0 -0
  25. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot.py +0 -0
  26. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/__init__.py +0 -0
  27. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/server.py +0 -0
  28. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -0
  29. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -0
  30. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -0
  31. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -0
  32. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -0
  33. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/main.js +0 -0
  34. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -0
  35. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -0
  36. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -0
  37. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -0
  38. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -0
  39. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/search.py +0 -0
  40. {datamule-1.0.0 → datamule-1.0.2}/datamule/mulebot/tools.py +0 -0
  41. {datamule-1.0.0 → datamule-1.0.2}/datamule/packageupdater.py +0 -0
  42. {datamule-1.0.0 → datamule-1.0.2}/datamule/portfolio.py +0 -0
  43. {datamule-1.0.0 → datamule-1.0.2}/datamule/submission.py +0 -0
  44. {datamule-1.0.0 → datamule-1.0.2}/datamule.egg-info/dependency_links.txt +0 -0
  45. {datamule-1.0.0 → datamule-1.0.2}/datamule.egg-info/requires.txt +0 -0
  46. {datamule-1.0.0 → datamule-1.0.2}/datamule.egg-info/top_level.txt +0 -0
  47. {datamule-1.0.0 → datamule-1.0.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: Making it easier to use SEC filings.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
File without changes
@@ -6,8 +6,26 @@
6
6
  # likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
7
7
  # wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
8
8
 
9
+ # example queries filter by sic = 7372, xbrl query = dei:operatingprofit > 0 in date range 2018-2019
10
+
11
+ # hmm, do we go for SQL-esque or not?
12
+ # I think we do.
13
+ # i think we remove cik, ticker, sic, etc and just have a query object
14
+ # should be SQL-esque so users can use it easily w/o learning new syntax
15
+
16
+ # WHERE submission_type = '10-K'
17
+ # AND us-gaap:ResearchAndDevelopmentExpense > 0
18
+ # AND dei:debt_to_equity < 2
19
+ # AND filing_date BETWEEN '2023-01-01' AND '2023-12-31'
20
+ # AND CIK in (123, 456, 789)
21
+ # AND SIC in (123, 456, 789)
22
+ # AND ticker in ('AAPL', 'GOOGL', 'AMZN')
23
+ # AND document_type = 'EX-99.1' # to select attachments
24
+
25
+ from .eftsquery import EFTSQuery
26
+
27
+
9
28
  class Book():
10
- pass
11
29
  def process_submissions(self,cik,ticker,sic,submission_type,document_type,date,
12
30
  xbrl_query={},
13
31
  metadata_callback=None,
@@ -0,0 +1,127 @@
1
+ import asyncio
2
+ import aiohttp
3
+ from tqdm import tqdm
4
+ from datetime import datetime
5
+ from urllib.parse import urlencode
6
+ import time
7
+
8
class PreciseRateLimiter:
    """Serialise async callers so consecutive grants are evenly spaced.

    Each successful :meth:`acquire` is separated from the previous one by at
    least ``interval / rate`` seconds.

    Args:
        rate: Number of requests allowed per ``interval``.
        interval: Length of the rate window, in seconds.
    """

    def __init__(self, rate=10, interval=1.0):
        self.rate = rate  # requests per interval
        self.interval = interval  # in seconds
        self.token_time = self.interval / self.rate  # time per token
        # Monotonic clock: spacing must not be affected by wall-clock changes.
        self.last_time = time.monotonic()
        # The lock is created lazily inside the running loop (see _get_lock).
        # Instances are built in synchronous code and then used inside a fresh
        # asyncio.run() loop, and an asyncio.Lock cannot be shared between
        # event loops.
        self.lock = None
        self._lock_loop = None

    def _get_lock(self):
        """Return a lock that belongs to the currently running event loop."""
        loop = asyncio.get_running_loop()
        if self.lock is None or self._lock_loop is not loop:
            self.lock = asyncio.Lock()
            self._lock_loop = loop
        return self.lock

    async def acquire(self):
        """Wait until the next request slot is available.

        Returns:
            Always ``True``, once the caller may proceed.
        """
        async with self._get_lock():
            wait_time = self.last_time + self.token_time - time.monotonic()
            if wait_time > 0:
                # Sleep while holding the lock so queued callers stay spaced.
                await asyncio.sleep(wait_time)
            self.last_time = time.monotonic()
            return True
24
+
25
class EFTSQuery:
    """Collect filing identifiers from the ``efts.sec.gov`` search-index API.

    Call :meth:`query_efts` from synchronous code: it opens the HTTP session,
    pages through every hit and closes the session again.  Requests are
    throttled through ``PreciseRateLimiter(10)``.
    """

    def __init__(self):
        # NOTE(review): the User-Agent value looks like a placeholder identity;
        # presumably callers are expected to supply real contact details - confirm.
        self.headers = {
            'User-Agent': 'Your Name yourname@email.com',
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'efts.sec.gov'
        }
        self.session = None  # created in __aenter__, closed in __aexit__
        self.limiter = PreciseRateLimiter(10)

    async def __aenter__(self):
        """Open the aiohttp session (if not already open) and return ``self``."""
        if not self.session:
            self.session = aiohttp.ClientSession(headers=self.headers)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the aiohttp session and forget it."""
        if self.session:
            await self.session.close()
            self.session = None

    async def _fetch_json(self, url):
        """GET ``url`` and return the decoded JSON body.

        A 429 response is retried after a 61 second pause.  Any other non-200
        status, or any exception raised while fetching/decoding, is reported
        with ``print`` and yields ``None``.

        Args:
            url: Fully built request URL.

        Returns:
            The parsed JSON payload, or ``None`` on failure.
        """
        # Loop instead of recursing so the throttled response is released
        # before the back-off sleep and repeated retries do not nest coroutines.
        while True:
            await self.limiter.acquire()
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json()
                    if response.status != 429:
                        print(f"Error {response.status} for URL: {url}")
                        return None
            except Exception as e:
                print(f"Error fetching {url}: {str(e)}")
                return None
            await asyncio.sleep(61)

    async def _get_accession_numbers(self, base_url):
        """Page through every hit of ``base_url`` and collect identifiers.

        A one-hit probe request reads the total hit count; the remaining pages
        are then fetched 100 hits at a time, ten pages concurrently.  Pages
        that fail to download are skipped.

        Args:
            base_url: Search URL including its query string; paging
                parameters (``from``/``size``) are appended to it.

        Returns:
            A list of ``"<first cik>/<hit _id up to the first ':'>"`` strings,
            empty when the probe fails or reports no hits.
        """
        probe = await self._fetch_json(f"{base_url}&from=0&size=1")
        if not probe or 'hits' not in probe:
            return []

        total_hits = probe['hits']['total']['value']
        if not total_hits:
            return []

        accession_numbers = []
        page_size = 100
        batch_size = 10  # Number of concurrent requests
        batch_span = batch_size * page_size

        with tqdm(total=total_hits) as pbar:
            for batch_start in range(0, total_hits, batch_span):
                batch_end = min(batch_start + batch_span, total_hits)
                tasks = [
                    self._fetch_json(f"{base_url}&from={offset}&size={page_size}")
                    for offset in range(batch_start, batch_end, page_size)
                ]
                results = await asyncio.gather(*tasks)

                for data in results:
                    if data and 'hits' in data:
                        hits = data['hits']['hits']
                        accession_numbers.extend(
                            f"{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}"
                            for hit in hits
                        )
                        pbar.update(len(hits))

        return accession_numbers

    @staticmethod
    def _date_ranges(filing_date):
        """Normalise ``filing_date`` into a list of ``(start, end)`` pairs.

        * list  -> one single-day range per entry (an empty list gives ``[]``)
        * tuple -> used as the one ``(start, end)`` range
        * str   -> ``"start,end"``, or a single date used for both ends
        * falsy -> ``2001-01-01`` up to today
        """
        if isinstance(filing_date, list):
            return [(day, day) for day in filing_date]
        if isinstance(filing_date, tuple):
            return [filing_date]
        if not filing_date:
            return [("2001-01-01", datetime.now().strftime('%Y-%m-%d'))]
        parts = filing_date.split(',')
        if len(parts) == 1:
            # A lone date means "that day only" rather than an unpacking error.
            parts = parts * 2
        start, end = parts
        return [(start, end)]

    def _search_urls(self, cik, submission_type, filing_date, search_text):
        """Return one search-index URL per requested date range."""
        urls = []
        for start, end in self._date_ranges(filing_date):
            params = {}
            if cik:
                params['ciks'] = str(cik).zfill(10)
            if submission_type:
                params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type
            params['startdt'], params['enddt'] = start, end
            if search_text:
                # Wrapped in double quotes before URL-encoding.
                params['q'] = f'"{search_text}"'
            urls.append(f"https://efts.sec.gov/LATEST/search-index?{urlencode(params, doseq=True)}")
        return urls

    def query_efts(self, cik=None, ticker=None, submission_type=None, filing_date=None, search_text=None):
        """Run a search synchronously and return the matching identifiers.

        Args:
            cik: Company CIK; zero-padded to ten digits.
            ticker: Accepted for interface compatibility; not used by this
                method.
            submission_type: A form type string, or a list of them (joined
                with commas).
            filing_date: ``"start,end"`` string, a single date string, a
                ``(start, end)`` tuple, or a list of individual dates.  Every
                entry of a list is queried.  Defaults to 2001-01-01 through
                today.
            search_text: Free-text phrase to search for.

        Returns:
            A list of ``"<cik>/<id>"`` strings, concatenated in date-range
            order.
        """
        # Build (and thereby validate) every URL before opening a session.
        search_urls = self._search_urls(cik, submission_type, filing_date, search_text)

        async def _download():
            async with self:
                collected = []
                for base_url in search_urls:
                    collected.extend(await self._get_accession_numbers(base_url))
                return collected

        return asyncio.run(_download())
@@ -0,0 +1,88 @@
1
+ import asyncio
2
+ import aiohttp
3
+ import time
4
+
5
class PreciseRateLimiter:
    """Serialise async callers so consecutive grants are evenly spaced.

    Each successful :meth:`acquire` is separated from the previous one by at
    least ``interval / rate`` seconds.

    Args:
        rate: Number of requests allowed per ``interval``.
        interval: Length of the rate window, in seconds.
    """

    def __init__(self, rate=10, interval=1.0):
        self.rate = rate  # requests per interval
        self.interval = interval  # in seconds
        self.token_time = self.interval / self.rate  # time per token
        # Monotonic clock: spacing must not be affected by wall-clock changes.
        self.last_time = time.monotonic()
        # The lock is created lazily inside the running loop (see _get_lock).
        # Instances are built in synchronous code and then used inside a fresh
        # asyncio.run() loop, and an asyncio.Lock cannot be shared between
        # event loops.
        self.lock = None
        self._lock_loop = None

    def _get_lock(self):
        """Return a lock that belongs to the currently running event loop."""
        loop = asyncio.get_running_loop()
        if self.lock is None or self._lock_loop is not loop:
            self.lock = asyncio.Lock()
            self._lock_loop = loop
        return self.lock

    async def acquire(self):
        """Wait until the next request slot is available.

        Returns:
            Always ``True``, once the caller may proceed.
        """
        async with self._get_lock():
            wait_time = self.last_time + self.token_time - time.monotonic()
            if wait_time > 0:
                # Sleep while holding the lock so queued callers stay spaced.
                await asyncio.sleep(wait_time)
            self.last_time = time.monotonic()
            return True
21
+
22
class XBRLRetriever:
    """Download XBRL "frames" JSON documents from ``data.sec.gov``.

    Call :meth:`get_xbrl_frames` from synchronous code: it opens the HTTP
    session, fetches every requested frame concurrently and closes the
    session again.  Requests are throttled through ``PreciseRateLimiter(10)``.
    """

    def __init__(self):
        self.base_url = "https://data.sec.gov/api/xbrl/frames"
        # NOTE(review): the User-Agent value looks like a placeholder identity;
        # presumably callers are expected to supply real contact details - confirm.
        self.headers = {
            'User-Agent': 'Your Name yourname@email.com',
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'data.sec.gov'
        }
        self.session = None  # created in __aenter__, closed in __aexit__
        self.limiter = PreciseRateLimiter(10)

    async def __aenter__(self):
        """Open the aiohttp session (if not already open) and return ``self``."""
        if not self.session:
            self.session = aiohttp.ClientSession(headers=self.headers)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the aiohttp session and forget it."""
        if self.session:
            await self.session.close()
            self.session = None

    async def _fetch_json(self, url):
        """GET ``url`` and return the decoded JSON body.

        A 429 response is retried after a 61 second pause.  Any other non-200
        status, or any exception raised while fetching/decoding, is reported
        with ``print`` and yields ``None``.

        Args:
            url: Fully built request URL.

        Returns:
            The parsed JSON payload, or ``None`` on failure.
        """
        # Loop instead of recursing so the throttled response is released
        # before the back-off sleep and repeated retries do not nest coroutines.
        while True:
            await self.limiter.acquire()
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json()
                    if response.status != 429:
                        print(f"Error {response.status} for URL: {url}")
                        return None
            except Exception as e:
                print(f"Error fetching {url}: {str(e)}")
                return None
            await asyncio.sleep(61)

    def _build_url(self, params):
        """Build the frames URL for one parameter set.

        Args:
            params: Mapping with the keys ``taxonomy``, ``concept``, ``unit``
                and ``period``; all four must be present and truthy.

        Returns:
            ``"<base_url>/<taxonomy>/<concept>/<unit>/<period>.json"``.

        Raises:
            ValueError: If any of the four values is missing or empty.
        """
        taxonomy = params.get('taxonomy')
        concept = params.get('concept')
        unit = params.get('unit')
        period = params.get('period')

        if not all([taxonomy, concept, unit, period]):
            raise ValueError("Missing required parameters")

        return f"{self.base_url}/{taxonomy}/{concept}/{unit}/{period}.json"

    async def _get_xbrl_data(self, params_list):
        """Fetch every frame described by ``params_list`` concurrently.

        Args:
            params_list: Iterable of parameter mappings accepted by
                :meth:`_build_url`.

        Returns:
            A dict mapping each frame URL to its JSON payload; URLs whose
            download failed are omitted.

        Raises:
            ValueError: If any parameter set is incomplete.
        """
        # De-duplicate while keeping first-seen order.  The result is keyed by
        # URL, so fetching a repeated URL twice would both waste a request and
        # misalign the URL/result pairing.
        urls = list(dict.fromkeys(self._build_url(params) for params in params_list))

        results = await asyncio.gather(*(self._fetch_json(url) for url in urls))

        return {url: result for url, result in zip(urls, results) if result is not None}

    def get_xbrl_frames(self, params_list):
        """Synchronously download the frames described by ``params_list``.

        Returns:
            The ``{url: payload}`` dict produced by :meth:`_get_xbrl_data`.
        """
        async def _download():
            async with self:
                return await self._get_xbrl_data(params_list)

        return asyncio.run(_download())
@@ -2,7 +2,7 @@ import json
2
2
  import csv
3
3
  from .helper import convert_to_dashed_accession
4
4
  import re
5
- from doc2dict import xml2dict, txt2dict
5
+ from doc2dict import xml2dict, txt2dict, dict2dict
6
6
  from doc2dict.mapping import flatten_hierarchy
7
7
  from .mapping_dicts import txt_mapping_dicts
8
8
  from .mapping_dicts import xml_mapping_dicts
@@ -126,7 +126,8 @@ class Document:
126
126
  elif self.type == 'SC 13G':
127
127
  mapping_dict = txt_mapping_dicts.dict_13g
128
128
 
129
- self.data = txt2dict(content=self.content, mapping_dict=mapping_dict)
129
+ self.data = {}
130
+ self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
130
131
  return self.data
131
132
 
132
133
  def write_json(self, output_filename=None):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: Making it easier to use SEC filings.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -1,6 +1,5 @@
1
1
  setup.py
2
2
  datamule/__init__.py
3
- datamule/book.py
4
3
  datamule/config.py
5
4
  datamule/document.py
6
5
  datamule/helper.py
@@ -13,6 +12,10 @@ datamule.egg-info/SOURCES.txt
13
12
  datamule.egg-info/dependency_links.txt
14
13
  datamule.egg-info/requires.txt
15
14
  datamule.egg-info/top_level.txt
15
+ datamule/book/__init__.py
16
+ datamule/book/book.py
17
+ datamule/book/eftsquery.py
18
+ datamule/book/xbrl_retriever.py
16
19
  datamule/data/company_former_names.csv
17
20
  datamule/data/company_metadata.csv
18
21
  datamule/data/company_tickers.csv
@@ -15,7 +15,7 @@ extras["all"] = list(all_dependencies)
15
15
  setup(
16
16
  name="datamule",
17
17
  author="John Friedman",
18
- version="1.0.0",
18
+ version="1.0.2",
19
19
  description="Making it easier to use SEC filings.",
20
20
  packages=find_namespace_packages(include=['datamule*']),
21
21
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes